From 239144659b29c0a5ecd83a34e0e57387a1060ed7 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Tue, 7 Dec 2010 18:50:28 -0500 Subject: Initial checkin for VOLK - Vector-Optimized Library of Kernels. This is a new SIMD library. It currently stands by itself under the GNU Radio tree and can be used separately. We will integrate the build process into GNU Raio and start building off of its functionality over time. --- volk/include/Makefile.am | 23 + volk/include/volk/Makefile.am | 133 ++++++ volk/include/volk/archs.xml | 133 ++++++ volk/include/volk/emit_omnilog.py | 13 + volk/include/volk/make_c.py | 76 ++++ volk/include/volk/make_config_fixed.py | 21 + volk/include/volk/make_config_in.py | 13 + volk/include/volk/make_cpuid_generic_c.py | 38 ++ volk/include/volk/make_cpuid_h.py | 25 ++ volk/include/volk/make_cpuid_powerpc_c.py | 45 ++ volk/include/volk/make_cpuid_x86_c.py | 93 ++++ volk/include/volk/make_environment_init_c.py | 33 ++ volk/include/volk/make_environment_init_h.py | 18 + volk/include/volk/make_h.py | 28 ++ volk/include/volk/make_init_c.py | 42 ++ volk/include/volk/make_init_h.py | 26 ++ volk/include/volk/make_mktables.py | 33 ++ volk/include/volk/make_proccpu_sim.py | 25 ++ volk/include/volk/make_registry.py | 62 +++ volk/include/volk/make_runtime.py | 34 ++ volk/include/volk/make_runtime_c.py | 47 +++ volk/include/volk/make_set_simd.py | 202 +++++++++ volk/include/volk/make_typedefs.py | 23 + volk/include/volk/volk_16s_add_quad_aligned16.h | 136 ++++++ .../volk/volk_16s_branch_4_state_8_aligned16.h | 194 +++++++++ volk/include/volk/volk_16s_convert_32f_aligned16.h | 119 ++++++ .../volk/volk_16s_convert_32f_unaligned16.h | 122 ++++++ volk/include/volk/volk_16s_convert_8s_aligned16.h | 69 +++ .../include/volk/volk_16s_convert_8s_unaligned16.h | 71 ++++ volk/include/volk/volk_16s_max_star_aligned16.h | 108 +++++ .../volk/volk_16s_max_star_horizontal_aligned16.h | 130 ++++++ .../volk_16s_permute_and_scalar_add_aligned16.h | 139 ++++++ .../volk/volk_16s_quad_max_star_aligned16.h | 191 +++++++++ .../volk/volk_16sc_deinterleave_16s_aligned16.h | 146 +++++++ .../volk/volk_16sc_deinterleave_32f_aligned16.h | 95 +++++ .../volk_16sc_deinterleave_real_16s_aligned16.h | 120 ++++++ .../volk_16sc_deinterleave_real_32f_aligned16.h | 125 ++++++ .../volk_16sc_deinterleave_real_8s_aligned16.h | 83 ++++ .../volk/volk_16sc_magnitude_16s_aligned16.h | 179 ++++++++ .../volk/volk_16sc_magnitude_32f_aligned16.h | 163 +++++++ volk/include/volk/volk_16u_byteswap_aligned16.h | 65 +++ volk/include/volk/volk_32f_accumulator_aligned16.h | 67 +++ volk/include/volk/volk_32f_add_aligned16.h | 69 +++ .../volk_32f_calc_spectral_noise_floor_aligned16.h | 167 ++++++++ volk/include/volk/volk_32f_convert_16s_aligned16.h | 110 +++++ .../volk/volk_32f_convert_16s_unaligned16.h | 113 +++++ volk/include/volk/volk_32f_convert_32s_aligned16.h | 106 +++++ .../volk/volk_32f_convert_32s_unaligned16.h | 109 +++++ volk/include/volk/volk_32f_convert_64f_aligned16.h | 70 +++ .../volk/volk_32f_convert_64f_unaligned16.h | 70 +++ volk/include/volk/volk_32f_convert_8s_aligned16.h | 117 ++++++ .../include/volk/volk_32f_convert_8s_unaligned16.h | 120 ++++++ volk/include/volk/volk_32f_divide_aligned16.h | 69 +++ volk/include/volk/volk_32f_dot_prod_aligned16.h | 184 ++++++++ volk/include/volk/volk_32f_dot_prod_unaligned16.h | 184 ++++++++ volk/include/volk/volk_32f_fm_detect_aligned16.h | 120 ++++++ volk/include/volk/volk_32f_index_max_aligned16.h | 148 +++++++ .../volk/volk_32f_interleave_16sc_aligned16.h | 155 +++++++ .../volk/volk_32f_interleave_32fc_aligned16.h | 75 ++++ volk/include/volk/volk_32f_max_aligned16.h | 71 ++++ volk/include/volk/volk_32f_min_aligned16.h | 71 ++++ volk/include/volk/volk_32f_multiply_aligned16.h | 69 +++ volk/include/volk/volk_32f_normalize_aligned16.h | 66 +++ volk/include/volk/volk_32f_power_aligned16.h | 144 +++++++ volk/include/volk/volk_32f_sqrt_aligned16.h | 64 +++ volk/include/volk/volk_32f_stddev_aligned16.h | 144 +++++++ .../volk/volk_32f_stddev_and_mean_aligned16.h | 169 ++++++++ volk/include/volk/volk_32f_subtract_aligned16.h | 67 +++ volk/include/volk/volk_32f_sum_of_poly_aligned16.h | 151 +++++++ .../volk/volk_32fc_32f_multiply_aligned16.h | 82 ++++ .../volk/volk_32fc_32f_power_32fc_aligned16.h | 109 +++++ volk/include/volk/volk_32fc_atan2_32f_aligned16.h | 158 +++++++ .../volk/volk_32fc_conjugate_dot_prod_aligned16.h | 344 +++++++++++++++ .../volk/volk_32fc_deinterleave_32f_aligned16.h | 75 ++++ .../volk/volk_32fc_deinterleave_64f_aligned16.h | 78 ++++ .../volk_32fc_deinterleave_real_16s_aligned16.h | 80 ++++ .../volk_32fc_deinterleave_real_32f_aligned16.h | 68 +++ .../volk_32fc_deinterleave_real_64f_aligned16.h | 66 +++ volk/include/volk/volk_32fc_dot_prod_aligned16.h | 468 +++++++++++++++++++++ volk/include/volk/volk_32fc_index_max_aligned16.h | 215 ++++++++++ .../volk/volk_32fc_magnitude_16s_aligned16.h | 146 +++++++ .../volk/volk_32fc_magnitude_32f_aligned16.h | 121 ++++++ volk/include/volk/volk_32fc_multiply_aligned16.h | 78 ++++ ...olk_32fc_power_spectral_density_32f_aligned16.h | 134 ++++++ .../volk/volk_32fc_power_spectrum_32f_aligned16.h | 126 ++++++ .../include/volk/volk_32fc_square_dist_aligned16.h | 112 +++++ .../volk_32fc_square_dist_scalar_mult_aligned16.h | 126 ++++++ volk/include/volk/volk_32s_and_aligned16.h | 69 +++ volk/include/volk/volk_32s_convert_32f_aligned16.h | 73 ++++ .../volk/volk_32s_convert_32f_unaligned16.h | 75 ++++ volk/include/volk/volk_32s_or_aligned16.h | 69 +++ volk/include/volk/volk_32u_byteswap_aligned16.h | 77 ++++ volk/include/volk/volk_32u_popcnt_aligned16.h | 36 ++ volk/include/volk/volk_64f_convert_32f_aligned16.h | 67 +++ .../volk/volk_64f_convert_32f_unaligned16.h | 67 +++ volk/include/volk/volk_64f_max_aligned16.h | 71 ++++ volk/include/volk/volk_64f_min_aligned16.h | 71 ++++ volk/include/volk/volk_64u_byteswap_aligned16.h | 88 ++++ volk/include/volk/volk_64u_popcnt_aligned16.h | 74 ++++ volk/include/volk/volk_8s_convert_16s_aligned16.h | 71 ++++ .../include/volk/volk_8s_convert_16s_unaligned16.h | 73 ++++ volk/include/volk/volk_8s_convert_32f_aligned16.h | 92 ++++ .../include/volk/volk_8s_convert_32f_unaligned16.h | 94 +++++ .../volk/volk_8sc_deinterleave_16s_aligned16.h | 77 ++++ .../volk/volk_8sc_deinterleave_32f_aligned16.h | 164 ++++++++ .../volk_8sc_deinterleave_real_16s_aligned16.h | 66 +++ .../volk_8sc_deinterleave_real_32f_aligned16.h | 133 ++++++ .../volk/volk_8sc_deinterleave_real_8s_aligned16.h | 67 +++ .../volk_8sc_multiply_conjugate_16sc_aligned16.h | 102 +++++ .../volk_8sc_multiply_conjugate_32fc_aligned16.h | 122 ++++++ volk/include/volk/volk_common.h | 18 + volk/include/volk/volk_complex.h | 71 ++++ volk/include/volk/volk_regexp.py | 8 + volk/include/volk/volk_register.py | 279 ++++++++++++ 114 files changed, 11370 insertions(+) create mode 100644 volk/include/Makefile.am create mode 100644 volk/include/volk/Makefile.am create mode 100644 volk/include/volk/archs.xml create mode 100644 volk/include/volk/emit_omnilog.py create mode 100644 volk/include/volk/make_c.py create mode 100644 volk/include/volk/make_config_fixed.py create mode 100644 volk/include/volk/make_config_in.py create mode 100644 volk/include/volk/make_cpuid_generic_c.py create mode 100644 volk/include/volk/make_cpuid_h.py create mode 100644 volk/include/volk/make_cpuid_powerpc_c.py create mode 100644 volk/include/volk/make_cpuid_x86_c.py create mode 100644 volk/include/volk/make_environment_init_c.py create mode 100644 volk/include/volk/make_environment_init_h.py create mode 100644 volk/include/volk/make_h.py create mode 100644 volk/include/volk/make_init_c.py create mode 100644 volk/include/volk/make_init_h.py create mode 100644 volk/include/volk/make_mktables.py create mode 100644 volk/include/volk/make_proccpu_sim.py create mode 100644 volk/include/volk/make_registry.py create mode 100644 volk/include/volk/make_runtime.py create mode 100644 volk/include/volk/make_runtime_c.py create mode 100644 volk/include/volk/make_set_simd.py create mode 100644 volk/include/volk/make_typedefs.py create mode 100644 volk/include/volk/volk_16s_add_quad_aligned16.h create mode 100644 volk/include/volk/volk_16s_branch_4_state_8_aligned16.h create mode 100644 volk/include/volk/volk_16s_convert_32f_aligned16.h create mode 100644 volk/include/volk/volk_16s_convert_32f_unaligned16.h create mode 100644 volk/include/volk/volk_16s_convert_8s_aligned16.h create mode 100644 volk/include/volk/volk_16s_convert_8s_unaligned16.h create mode 100644 volk/include/volk/volk_16s_max_star_aligned16.h create mode 100644 volk/include/volk/volk_16s_max_star_horizontal_aligned16.h create mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h create mode 100644 volk/include/volk/volk_16s_quad_max_star_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h create mode 100644 volk/include/volk/volk_16sc_magnitude_16s_aligned16.h create mode 100644 volk/include/volk/volk_16sc_magnitude_32f_aligned16.h create mode 100644 volk/include/volk/volk_16u_byteswap_aligned16.h create mode 100644 volk/include/volk/volk_32f_accumulator_aligned16.h create mode 100644 volk/include/volk/volk_32f_add_aligned16.h create mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h create mode 100644 volk/include/volk/volk_32f_convert_16s_aligned16.h create mode 100644 volk/include/volk/volk_32f_convert_16s_unaligned16.h create mode 100644 volk/include/volk/volk_32f_convert_32s_aligned16.h create mode 100644 volk/include/volk/volk_32f_convert_32s_unaligned16.h create mode 100644 volk/include/volk/volk_32f_convert_64f_aligned16.h create mode 100644 volk/include/volk/volk_32f_convert_64f_unaligned16.h create mode 100644 volk/include/volk/volk_32f_convert_8s_aligned16.h create mode 100644 volk/include/volk/volk_32f_convert_8s_unaligned16.h create mode 100644 volk/include/volk/volk_32f_divide_aligned16.h create mode 100644 volk/include/volk/volk_32f_dot_prod_aligned16.h create mode 100644 volk/include/volk/volk_32f_dot_prod_unaligned16.h create mode 100644 volk/include/volk/volk_32f_fm_detect_aligned16.h create mode 100644 volk/include/volk/volk_32f_index_max_aligned16.h create mode 100644 volk/include/volk/volk_32f_interleave_16sc_aligned16.h create mode 100644 volk/include/volk/volk_32f_interleave_32fc_aligned16.h create mode 100644 volk/include/volk/volk_32f_max_aligned16.h create mode 100644 volk/include/volk/volk_32f_min_aligned16.h create mode 100644 volk/include/volk/volk_32f_multiply_aligned16.h create mode 100644 volk/include/volk/volk_32f_normalize_aligned16.h create mode 100644 volk/include/volk/volk_32f_power_aligned16.h create mode 100644 volk/include/volk/volk_32f_sqrt_aligned16.h create mode 100644 volk/include/volk/volk_32f_stddev_aligned16.h create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_aligned16.h create mode 100644 volk/include/volk/volk_32f_subtract_aligned16.h create mode 100644 volk/include/volk/volk_32f_sum_of_poly_aligned16.h create mode 100644 volk/include/volk/volk_32fc_32f_multiply_aligned16.h create mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h create mode 100644 volk/include/volk/volk_32fc_atan2_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_dot_prod_aligned16.h create mode 100644 volk/include/volk/volk_32fc_index_max_aligned16.h create mode 100644 volk/include/volk/volk_32fc_magnitude_16s_aligned16.h create mode 100644 volk/include/volk/volk_32fc_magnitude_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_multiply_aligned16.h create mode 100644 volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_square_dist_aligned16.h create mode 100644 volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h create mode 100644 volk/include/volk/volk_32s_and_aligned16.h create mode 100644 volk/include/volk/volk_32s_convert_32f_aligned16.h create mode 100644 volk/include/volk/volk_32s_convert_32f_unaligned16.h create mode 100644 volk/include/volk/volk_32s_or_aligned16.h create mode 100644 volk/include/volk/volk_32u_byteswap_aligned16.h create mode 100644 volk/include/volk/volk_32u_popcnt_aligned16.h create mode 100644 volk/include/volk/volk_64f_convert_32f_aligned16.h create mode 100644 volk/include/volk/volk_64f_convert_32f_unaligned16.h create mode 100644 volk/include/volk/volk_64f_max_aligned16.h create mode 100644 volk/include/volk/volk_64f_min_aligned16.h create mode 100644 volk/include/volk/volk_64u_byteswap_aligned16.h create mode 100644 volk/include/volk/volk_64u_popcnt_aligned16.h create mode 100644 volk/include/volk/volk_8s_convert_16s_aligned16.h create mode 100644 volk/include/volk/volk_8s_convert_16s_unaligned16.h create mode 100644 volk/include/volk/volk_8s_convert_32f_aligned16.h create mode 100644 volk/include/volk/volk_8s_convert_32f_unaligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h create mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h create mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h create mode 100644 volk/include/volk/volk_common.h create mode 100644 volk/include/volk/volk_complex.h create mode 100644 volk/include/volk/volk_regexp.py create mode 100755 volk/include/volk/volk_register.py (limited to 'volk/include') diff --git a/volk/include/Makefile.am b/volk/include/Makefile.am new file mode 100644 index 000000000..375d1a7d5 --- /dev/null +++ b/volk/include/Makefile.am @@ -0,0 +1,23 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +SUBDIRS = volk diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am new file mode 100644 index 000000000..fd59ab795 --- /dev/null +++ b/volk/include/volk/Makefile.am @@ -0,0 +1,133 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +ourinclude_HEADERS = \ + volk_complex.h \ + volk_common.h \ + volk_config_fixed.h \ + volk_runtime.h \ + volk_config.h \ + volk_tables.h \ + volk_typedefs.h \ + volk.h \ + volk_cpu.h \ + volk_environment_init.h \ + volk_16s_add_quad_aligned16.h \ + volk_16s_branch_4_state_8_aligned16.h \ + volk_16sc_deinterleave_16s_aligned16.h \ + volk_16sc_deinterleave_32f_aligned16.h \ + volk_16sc_deinterleave_real_16s_aligned16.h \ + volk_16sc_deinterleave_real_32f_aligned16.h \ + volk_16sc_deinterleave_real_8s_aligned16.h \ + volk_16sc_magnitude_16s_aligned16.h \ + volk_16sc_magnitude_32f_aligned16.h \ + volk_16s_convert_32f_aligned16.h \ + volk_16s_convert_32f_unaligned16.h \ + volk_16s_convert_8s_aligned16.h \ + volk_16s_convert_8s_unaligned16.h \ + volk_16s_max_star_aligned16.h \ + volk_16s_max_star_horizontal_aligned16.h \ + volk_16s_permute_and_scalar_add_aligned16.h \ + volk_16s_quad_max_star_aligned16.h \ + volk_16u_byteswap_aligned16.h \ + volk_32f_accumulator_aligned16.h \ + volk_32f_add_aligned16.h \ + volk_32fc_32f_multiply_aligned16.h \ + volk_32fc_32f_power_32fc_aligned16.h \ + volk_32f_calc_spectral_noise_floor_aligned16.h \ + volk_32fc_atan2_32f_aligned16.h \ + volk_32fc_conjugate_dot_prod_aligned16.h \ + volk_32fc_deinterleave_32f_aligned16.h \ + volk_32fc_deinterleave_64f_aligned16.h \ + volk_32fc_deinterleave_real_16s_aligned16.h \ + volk_32fc_deinterleave_real_32f_aligned16.h \ + volk_32fc_deinterleave_real_64f_aligned16.h \ + volk_32fc_dot_prod_aligned16.h \ + volk_32fc_index_max_aligned16.h \ + volk_32fc_magnitude_16s_aligned16.h \ + volk_32fc_magnitude_32f_aligned16.h \ + volk_32fc_multiply_aligned16.h \ + volk_32f_convert_16s_aligned16.h \ + volk_32f_convert_16s_unaligned16.h \ + volk_32f_convert_32s_aligned16.h \ + volk_32f_convert_32s_unaligned16.h \ + volk_32f_convert_64f_aligned16.h \ + volk_32f_convert_64f_unaligned16.h \ + volk_32f_convert_8s_aligned16.h \ + volk_32f_convert_8s_unaligned16.h \ + volk_32fc_power_spectral_density_32f_aligned16.h \ + volk_32fc_power_spectrum_32f_aligned16.h \ + volk_32fc_square_dist_aligned16.h \ + volk_32fc_square_dist_scalar_mult_aligned16.h \ + volk_32f_divide_aligned16.h \ + volk_32f_dot_prod_aligned16.h \ + volk_32f_dot_prod_unaligned16.h \ + volk_32f_fm_detect_aligned16.h \ + volk_32f_index_max_aligned16.h \ + volk_32f_interleave_16sc_aligned16.h \ + volk_32f_interleave_32fc_aligned16.h \ + volk_32f_max_aligned16.h \ + volk_32f_min_aligned16.h \ + volk_32f_multiply_aligned16.h \ + volk_32f_normalize_aligned16.h \ + volk_32f_power_aligned16.h \ + volk_32f_sqrt_aligned16.h \ + volk_32f_stddev_aligned16.h \ + volk_32f_stddev_and_mean_aligned16.h \ + volk_32f_subtract_aligned16.h \ + volk_32f_sum_of_poly_aligned16.h \ + volk_32s_and_aligned16.h \ + volk_32s_convert_32f_aligned16.h \ + volk_32s_convert_32f_unaligned16.h \ + volk_32s_or_aligned16.h \ + volk_32u_byteswap_aligned16.h \ + volk_32u_popcnt_aligned16.h \ + volk_64f_convert_32f_aligned16.h \ + volk_64f_convert_32f_unaligned16.h \ + volk_64f_max_aligned16.h \ + volk_64f_min_aligned16.h \ + volk_64u_byteswap_aligned16.h \ + volk_64u_popcnt_aligned16.h \ + volk_8sc_deinterleave_16s_aligned16.h \ + volk_8sc_deinterleave_32f_aligned16.h \ + volk_8sc_deinterleave_real_16s_aligned16.h \ + volk_8sc_deinterleave_real_32f_aligned16.h \ + volk_8sc_deinterleave_real_8s_aligned16.h \ + volk_8sc_multiply_conjugate_16sc_aligned16.h \ + volk_8sc_multiply_conjugate_32fc_aligned16.h \ + volk_8s_convert_16s_aligned16.h \ + volk_8s_convert_16s_unaligned16.h \ + volk_8s_convert_32f_aligned16.h \ + volk_8s_convert_32f_unaligned16.h + +distclean-local: + rm -f volk_config_fixed.h + rm -f volk_config.h + rm -f volk_cpu.h + rm -f volk.h + rm -f volk_registry.h + rm -f volk_runtime.h + rm -f volk_typedefs.h + rm -f volk_tables.h + rm -f *.pyc + rm -f Makefile.in + rm -f volk_environment_init.h \ No newline at end of file diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml new file mode 100644 index 000000000..c9c441008 --- /dev/null +++ b/volk/include/volk/archs.xml @@ -0,0 +1,133 @@ + + + + + none + + + + maltivec + + + + 0x80000001 + m32 + d + 29 + 0 + + + + 0x80000001 + d + 29 + m64 + 1 + + + + 0x80000001 + d + 31 + m3dnow + 1 + + + + 1 + 0x80000001 + d + 5 + sse4.2 + + + + 1 + 1 + c + 23 + mpopcnt + + + + 1 + 1 + d + 23 + mmmx + + + + + 1 + 1 + d + 25 + msse + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + xmmintrin.h + + + + + 1 + 1 + d + 26 + msse2 + + + + 1 + 1 + c + 0 + msse3 + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + pmmintrin.h + + + + 1 + 1 + c + 9 + mssse3 + + + + 1 + 0x80000001 + c + 6 + msse4a + + + + + 1 + 1 + c + 19 + msse4.1 + + + + 1 + 1 + c + 20 + msse4.2 + + + + + 1 + 1 + c + 28 + mavx + + + + diff --git a/volk/include/volk/emit_omnilog.py b/volk/include/volk/emit_omnilog.py new file mode 100644 index 000000000..309d7e578 --- /dev/null +++ b/volk/include/volk/emit_omnilog.py @@ -0,0 +1,13 @@ +def emit_prolog(): + tempstring = ""; + tempstring = tempstring + '#ifdef __cplusplus\n'; + tempstring = tempstring + 'extern "C" {\n'; + tempstring = tempstring + '#endif\n'; + return tempstring; +def emit_epilog(): + tempstring = ""; + tempstring = tempstring + '#ifdef __cplusplus\n'; + tempstring = tempstring + '}\n'; + tempstring = tempstring + '#endif\n'; + return tempstring; + diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py new file mode 100644 index 000000000..08d6d949c --- /dev/null +++ b/volk/include/volk/make_c.py @@ -0,0 +1,76 @@ +from xml.dom import minidom +import string +from volk_regexp import * + + +def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + for func in funclist: + tempstring = tempstring + "#include\n" ; + tempstring = tempstring + '\n'; + + tempstring = tempstring + "static inline unsigned int volk_get_index(const char** indices, const char* arch, const int* arch_defs) {\n"; + tempstring = tempstring + " int i = 1;\n" + tempstring = tempstring + " for(;i\n" + tempstring = tempstring + "#include \n" + tempstring = tempstring + "\n\n" + + for domarch in dom: + if str(domarch.attributes["type"].value) == "all": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 1;\n" + tempstring = tempstring + "}\n\n" + + else: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 0;\n" + tempstring = tempstring + "}\n\n" + + tempstring = tempstring + "void volk_cpu_init() {\n"; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" + tempstring = tempstring + "}\n\n" + + tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; + tempstring = tempstring + " unsigned int retval = 0;\n" + tempstring = tempstring + " volk_cpu_init();\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" + tempstring = tempstring + " return retval;\n" + tempstring = tempstring + "}\n\n" + + return tempstring; diff --git a/volk/include/volk/make_cpuid_h.py b/volk/include/volk/make_cpuid_h.py new file mode 100644 index 000000000..823e3b2c0 --- /dev/null +++ b/volk/include/volk/make_cpuid_h.py @@ -0,0 +1,25 @@ +from xml.dom import minidom +from emit_omnilog import * + +def make_cpuid_h(dom) : + tempstring = ""; + tempstring = tempstring +'/*this file is auto generated by volk_register.py*/'; + tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_CPU_H'; + tempstring = tempstring +'\n#define INCLUDED_VOLK_CPU_H\n\n'; + tempstring = tempstring + emit_prolog(); + tempstring = tempstring + '\n' + + tempstring = tempstring + "struct VOLK_CPU {\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " int (*has_" + arch + ") ();\n"; + tempstring = tempstring + "}volk_cpu;\n\n"; + + tempstring = tempstring + "void volk_cpu_init ();\n" + tempstring = tempstring + "unsigned int volk_get_lvarch ();\n" + + tempstring = tempstring + "\n"; + tempstring = tempstring + emit_epilog(); + tempstring = tempstring + "#endif /*INCLUDED_VOLK_CPU_H*/\n" + + return tempstring; diff --git a/volk/include/volk/make_cpuid_powerpc_c.py b/volk/include/volk/make_cpuid_powerpc_c.py new file mode 100644 index 000000000..443a58488 --- /dev/null +++ b/volk/include/volk/make_cpuid_powerpc_c.py @@ -0,0 +1,45 @@ +from xml.dom import minidom + +def make_cpuid_powerpc_c(dom) : + tempstring = ""; + tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; + tempstring = tempstring + "#include \n" + tempstring = tempstring + "#include \n" + tempstring = tempstring + "\n\n" + + #just assume it has them for powerpc + for domarch in dom: + if str(domarch.attributes["type"].value) == "powerpc": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 1;\n" + tempstring = tempstring + "}\n\n" + elif str(domarch.attributes["type"].value) == "all": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 1;\n" + tempstring = tempstring + "}\n\n" + else: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 0;\n" + tempstring = tempstring + "}\n\n" + + + tempstring = tempstring + "void volk_cpu_init() {\n"; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" + + tempstring = tempstring + "}\n\n" + tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; + tempstring = tempstring + " unsigned int retval = 0;\n" + tempstring = tempstring + " volk_cpu_init();\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" + tempstring = tempstring + " return retval;\n" + tempstring = tempstring + "}\n\n" + + return tempstring; + diff --git a/volk/include/volk/make_cpuid_x86_c.py b/volk/include/volk/make_cpuid_x86_c.py new file mode 100644 index 000000000..fd83e5e5c --- /dev/null +++ b/volk/include/volk/make_cpuid_x86_c.py @@ -0,0 +1,93 @@ +from xml.dom import minidom + +def make_cpuid_x86_c(dom) : + tempstring = ""; + tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; + tempstring = tempstring + "#include \n" + tempstring = tempstring + "#include \n" + tempstring = tempstring + "\n\n" + tempstring = tempstring + "extern void cpuid_x86 (unsigned int op, unsigned int result[4]);\n\n" + tempstring = tempstring + "static inline unsigned int cpuid_eax(unsigned int op) {\n"; + tempstring = tempstring + " unsigned int regs[4];\n" + tempstring = tempstring + " cpuid_x86 (op, regs);\n" + tempstring = tempstring + " return regs[0];\n" + tempstring = tempstring + "}\n\n"; + + tempstring = tempstring + "static inline unsigned int cpuid_ebx(unsigned int op) {\n"; + tempstring = tempstring + " unsigned int regs[4];\n" + tempstring = tempstring + " cpuid_x86 (op, regs);\n" + tempstring = tempstring + " return regs[1];\n" + tempstring = tempstring + "}\n\n"; + + tempstring = tempstring + "static inline unsigned int cpuid_ecx(unsigned int op) {\n"; + tempstring = tempstring + " unsigned int regs[4];\n" + tempstring = tempstring + " cpuid_x86 (op, regs);\n" + tempstring = tempstring + " return regs[2];\n" + tempstring = tempstring + "}\n\n"; + + tempstring = tempstring + "static inline unsigned int cpuid_edx(unsigned int op) {\n"; + tempstring = tempstring + " unsigned int regs[4];\n" + tempstring = tempstring + " cpuid_x86 (op, regs);\n" + tempstring = tempstring + " return regs[3];\n" + tempstring = tempstring + "}\n\n"; + + for domarch in dom: + if str(domarch.attributes["type"].value) == "x86": + arch = str(domarch.attributes["name"].value); + op = domarch.getElementsByTagName("op"); + op = str(op[0].firstChild.data); + reg = domarch.getElementsByTagName("reg"); + reg = str(reg[0].firstChild.data); + shift = domarch.getElementsByTagName("shift"); + shift = str(shift[0].firstChild.data); + val = domarch.getElementsByTagName("val"); + val = str(val[0].firstChild.data); + + if op == "1": + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " unsigned int e" + reg + "x = cpuid_e" + reg + "x (" + op + ");\n" + tempstring = tempstring + " return ((e" + reg + "x >> " + shift + ") & 1) == " + val + ";\n" + tempstring = tempstring + "}\n\n"; + + elif op == "0x80000001": + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " unsigned int extended_fct_count = cpuid_eax(0x80000000);\n"; + tempstring = tempstring + " if (extended_fct_count < 0x80000001)\n"; + tempstring = tempstring + " return "+ val + "^1;\n\n" + tempstring = tempstring + " unsigned int extended_features = cpuid_e" + reg + "x (" + op + ");\n"; + tempstring = tempstring + " return ((extended_features >> " + shift + ") & 1) == " + val + ";\n" + tempstring = tempstring + "}\n\n"; + elif str(domarch.attributes["type"].value) == "all": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 1;\n" + tempstring = tempstring + "}\n\n" + else: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 0;\n" + tempstring = tempstring + "}\n\n" + + tempstring = tempstring + "void volk_cpu_init() {\n"; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" + tempstring = tempstring + "}\n\n" + + tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; + tempstring = tempstring + " unsigned int retval = 0;\n" + tempstring = tempstring + " volk_cpu_init();\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" + tempstring = tempstring + " return retval;\n" + tempstring = tempstring + "}\n\n" + + return tempstring; + + + + + + + diff --git a/volk/include/volk/make_environment_init_c.py b/volk/include/volk/make_environment_init_c.py new file mode 100644 index 000000000..e06c7f246 --- /dev/null +++ b/volk/include/volk/make_environment_init_c.py @@ -0,0 +1,33 @@ +from xml.dom import minidom + +def make_environment_init_c(dom) : + tempstring = ""; + tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; + tempstring = tempstring + "#include\n" + tempstring = tempstring + "#include\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + incs = domarch.getElementsByTagName("include"); + for inc in incs: + my_inc = str(inc.firstChild.data); + tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + "#include<" + my_inc + ">\n"; + tempstring = tempstring + "#endif\n" + tempstring = tempstring + '\n\n'; + tempstring = tempstring + "void volk_environment_init(){\n" + + for domarch in dom: + arch = str(domarch.attributes["name"].value); + envs = domarch.getElementsByTagName("environment"); + for env in envs: + cmd = str(env.firstChild.data); + tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + " " + cmd + "\n"; + tempstring = tempstring + "#endif\n" + + tempstring = tempstring + "}\n"; + return tempstring; + + + + diff --git a/volk/include/volk/make_environment_init_h.py b/volk/include/volk/make_environment_init_h.py new file mode 100644 index 000000000..77a841a24 --- /dev/null +++ b/volk/include/volk/make_environment_init_h.py @@ -0,0 +1,18 @@ +from xml.dom import minidom +from emit_omnilog import * + +def make_environment_init_h() : + tempstring = ""; + tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; + tempstring = tempstring + "#ifndef INCLUDE_LIBVECTOR_ENVIRONMENT_INIT_H\n"; + tempstring = tempstring + "#define INCLUDE_LIBVECTOR_ENVIRONMENT_INIT_H\n"; + tempstring = tempstring + "\n"; + tempstring = tempstring + emit_prolog(); + tempstring = tempstring + "void volk_environment_init();\n"; + tempstring = tempstring + emit_epilog(); + tempstring = tempstring + "#endif\n" + return tempstring; + + + + diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py new file mode 100644 index 000000000..81d9ad401 --- /dev/null +++ b/volk/include/volk/make_h.py @@ -0,0 +1,28 @@ +from xml.dom import minidom +from emit_omnilog import * +from volk_regexp import * + + + +def make_h(funclist, arched_arglist, retlist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_H'; + tempstring = tempstring + '\n#define INCLUDED_VOLK_H'; + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n' + tempstring = tempstring + emit_prolog() + tempstring = tempstring + '\n'; + + for i in range(len(retlist)): + tempstring = tempstring + retlist[i] + funclist[i] + replace_bracket.sub(";", replace_arch.sub("", arched_arglist[i])) + '\n'; + tempstring = tempstring + retlist[i] + funclist[i] + "_manual" + replace_bracket.sub(";", arched_arglist[i]) + '\n'; + + tempstring = tempstring + emit_epilog(); + + tempstring = tempstring + "#endif /*INCLUDED_VOLK_H*/\n"; + + return tempstring; diff --git a/volk/include/volk/make_init_c.py b/volk/include/volk/make_init_c.py new file mode 100644 index 000000000..330e19592 --- /dev/null +++ b/volk/include/volk/make_init_c.py @@ -0,0 +1,42 @@ +from xml.dom import minidom + +def make_init_c(funclist, dom) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + incs = domarch.getElementsByTagName("include"); + for inc in incs: + my_inc = str(inc.firstChild.data); + tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + "#include<" + my_inc + ">\n"; + tempstring = tempstring + "#endif\n" + tempstring = tempstring + '\n\n'; + + tempstring = tempstring + "extern struct VOLK_RUNTIME volk_runtime;\n\n"; + tempstring = tempstring + "struct VOLK_RUNTIME* get_volk_runtime(){\n"; + tempstring = tempstring + " return &volk_runtime;\n"; + tempstring = tempstring + "}\n\n" + tempstring = tempstring + " void volk_runtime_init() {\nvolk_cpu_init();\n"; + + for func in funclist: + tempstring = tempstring + " volk_runtime." + func + " = default_acquire_" + func + ";\n"; + + for domarch in dom: + arch = str(domarch.attributes["name"].value); + envs = domarch.getElementsByTagName("environment"); + for env in envs: + cmd = str(env.firstChild.data); + tempstring = tempstring + " if(volk_cpu.has_" + arch + "()){\n"; + tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + " " + cmd + "\n"; + tempstring = tempstring + "#endif\n" + tempstring = tempstring + " }\n"; + + tempstring = tempstring + "}\n"; + + return tempstring diff --git a/volk/include/volk/make_init_h.py b/volk/include/volk/make_init_h.py new file mode 100644 index 000000000..6dbe1c585 --- /dev/null +++ b/volk/include/volk/make_init_h.py @@ -0,0 +1,26 @@ +from xml.dom import minidom +from emit_omnilog import * +from volk_regexp import * + + + +def make_init_h(funclist, arched_arglist, retlist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + + tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_INIT_H'; + tempstring = tempstring + '\n#define INCLUDED_VOLK_INIT_H'; + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + + tempstring = tempstring + '\n'; + + tempstring = tempstring + emit_prolog(); + + for i in range(len(retlist)): + tempstring = tempstring + retlist[i] + " default_acquire_" + funclist[i] + replace_bracket.sub(";", replace_arch.sub("", arched_arglist[i])) + '\n'; + + tempstring= tempstring + emit_epilog(); + tempstring = tempstring + "#endif /*INCLUDED_VOLK_INIT_H*/\n"; + + return tempstring; diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py new file mode 100644 index 000000000..051ac268d --- /dev/null +++ b/volk/include/volk/make_mktables.py @@ -0,0 +1,33 @@ + + +def make_mktables(funclist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; + + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstrgin = tempstring + '#include\n'; + tempstring = tempstring + "\n\n"; + + tempstring = tempstring + 'int main() {\n'; + tempstring = tempstring + ' int i = 0;\n'; + tempstring = tempstring + ' FILE* output;\n'; + tempstring = tempstring + ' output = fopen("volk_tables.h", "w");\n'; + tempstring = tempstring + ' fprintf(output, "#ifndef INCLUDED_VOLK_TABLES_H\\n");\n'; + tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n'; + + for func in funclist: + tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", volk_rank_archs(' + func + '_arch_defs, volk_get_lvarch()));\n'; + tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n'; + tempstring = tempstring + ' fclose(output);\n' + tempstring = tempstring + '}\n'; + return tempstring; + + + + + + + + diff --git a/volk/include/volk/make_proccpu_sim.py b/volk/include/volk/make_proccpu_sim.py new file mode 100644 index 000000000..c75a4d5fb --- /dev/null +++ b/volk/include/volk/make_proccpu_sim.py @@ -0,0 +1,25 @@ +from xml.dom import minidom + +def make_proccpu_sim(dom) : + tempstring = ""; + tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; + tempstring = tempstring + "#include \n" + tempstring = tempstring + "#include \n" + tempstring = tempstring + "\n\n" + + tempstring = tempstring + "void test_append(char* buf, int val, char* newkey){\n"; + tempstring = tempstring + " if(val==1){\n"; + tempstring = tempstring + " sprintf(buf, \"%s %s\", buf, newkey);\n"; + tempstring = tempstring + " }\n"; + tempstring = tempstring + "}\n"; + tempstring = tempstring + "\n\n"; + + tempstring = tempstring + "int main() {\n"; + tempstring = tempstring + " volk_cpu_init();\n"; + tempstring = tempstring + " char buf[2048];\n"; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " test_append(buf, volk_cpu.has_" + arch + "(), \"" + arch + "\");\n" + tempstring = tempstring + " printf(\"%s\\n\", buf);\n" + tempstring = tempstring + "}\n" + return tempstring; diff --git a/volk/include/volk/make_registry.py b/volk/include/volk/make_registry.py new file mode 100644 index 000000000..8457d61f3 --- /dev/null +++ b/volk/include/volk/make_registry.py @@ -0,0 +1,62 @@ +from xml.dom import minidom +from emit_omnilog import * +import string + +def make_registry(dom, funclist, fcountlist) : + tempstring = ""; + tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; + tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_REGISTRY_H'; + tempstring = tempstring +'\n#define INCLUDED_VOLK_REGISTRY_H\n\n'; + tempstring = tempstring +'#include\n'; + tempstring = tempstring +'#include\n'; + tempstring = tempstring + emit_prolog(); + tempstring = tempstring + '\n' + + + + + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring +"#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 1\n"; + tempstring = tempstring +"#else\n"; + tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 0\n"; + tempstring = tempstring +"#endif /*LV_HAVE_" + arch.swapcase() + "*/\n\n"; + + counter = 0; + for fcount in fcountlist: + tempstring = tempstring + "static const int " + funclist[counter] + "_arch_defs[] = {\n"; + counter = counter + 1; + for arch_list in fcount: + tempstring = tempstring + " (LV_" + for ind in range(len(arch_list)): + tempstring = tempstring + arch_list[ind] + "_CNT"; + if ind < len(arch_list) - 1: + tempstring = tempstring + " * LV_"; + tempstring = tempstring + ") + "; + lindex = tempstring.rfind(" + "); + tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], " + ", ""); + tempstring = tempstring + ",\n" + for arch_list in fcount: + tempstring = tempstring + "#if LV_HAVE_" + for ind in range(len(arch_list)): + tempstring = tempstring + arch_list[ind]; + if ind < len(arch_list) - 1: + tempstring = tempstring + " && LV_HAVE_"; + tempstring = tempstring + "\n" + tempstring = tempstring + " (1 << LV_" + for ind in range(len(arch_list)): + tempstring = tempstring + arch_list[ind]; + if ind < len(arch_list) - 1: + tempstring = tempstring + ") + (1 << LV_" + tempstring = tempstring + "),\n#endif\n" + lindex = tempstring.rfind(","); + tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); + tempstring = tempstring + "};\n\n" + + + tempstring = tempstring + emit_epilog(); + tempstring = tempstring +"#endif /*INCLUDED_VOLK_REGISTRY_H*/\n"; + + return tempstring; + diff --git a/volk/include/volk/make_runtime.py b/volk/include/volk/make_runtime.py new file mode 100644 index 000000000..645b3aaee --- /dev/null +++ b/volk/include/volk/make_runtime.py @@ -0,0 +1,34 @@ +from xml.dom import minidom +from emit_omnilog import * +from volk_regexp import * + + + +def make_runtime(funclist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; + + tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_RUNTIME'; + tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME'; + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + emit_prolog(); + + tempstring = tempstring + '\n'; + + tempstring = tempstring + "struct VOLK_RUNTIME {\n"; + + for i in range(len(funclist)): + tempstring = tempstring + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + ";\n"; + tempstring = tempstring + "};\n\n"; + + tempstring = tempstring + "struct VOLK_RUNTIME* get_volk_runtime();\n\n" + tempstring = tempstring + "\nvoid volk_runtime_init();\n"; + + tempstring = tempstring + emit_epilog(); + tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; + + return tempstring; + diff --git a/volk/include/volk/make_runtime_c.py b/volk/include/volk/make_runtime_c.py new file mode 100644 index 000000000..070df9ba7 --- /dev/null +++ b/volk/include/volk/make_runtime_c.py @@ -0,0 +1,47 @@ +from xml.dom import minidom +import string +from volk_regexp import * + + +def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + + + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + "#include\n"; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + + for func in funclist: + tempstring = tempstring + "#include\n" ; + tempstring = tempstring + '\n'; + + tempstring = tempstring + "struct VOLK_RUNTIME volk_runtime;\n"; + + for i in range(len(funclist)): + tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n"; + + tags_counter = 0; + for arch_list in fcountlist[i]: + tempstring = tempstring + "#if LV_HAVE_" + for ind in range(len(arch_list)): + + tempstring = tempstring + arch_list[ind]; + if ind < len(arch_list) - 1: + tempstring = tempstring + " && LV_HAVE_"; + + tempstring = tempstring + "\n " + funclist[i] + "_" + str(taglist[i][tags_counter]) + ",\n#endif\n"; + tags_counter = tags_counter + 1; + + lindex = tempstring.rfind(","); + tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); + tempstring = tempstring + "};\n\n"; + + + tempstring = tempstring + retlist[i] + "default_acquire_" + funclist[i] + replace_arch.sub("", arched_arglist[i]) + '\n'; + tempstring = tempstring + "volk_runtime." + funclist[i] + " = " + funclist[i] + "_archs[volk_rank_archs(" + funclist[i] + "_arch_defs, volk_get_lvarch())];\n" + "return " + funclist[i] + "_archs[volk_rank_archs(" + funclist[i] + "_arch_defs, volk_get_lvarch())](" + my_arglist[i] + ");" + '\n}\n'; + + return tempstring; diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py new file mode 100644 index 000000000..e320bc748 --- /dev/null +++ b/volk/include/volk/make_set_simd.py @@ -0,0 +1,202 @@ +from xml.dom import minidom + +def make_set_simd(dom) : + tempstring = ""; + tempstring = tempstring +'dnl this file is auto generated by volk_register.py\n\n'; + + tempstring = tempstring + "AC_DEFUN([_MAKE_FAKE_PROCCPU],\n"; + tempstring = tempstring + "[\n"; + tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n"; + tempstring = tempstring + " AC_MSG_CHECKING([proccpu])\n"; + tempstring = tempstring + " case \"$MD_CPU\" in\n"; + tempstring = tempstring + " (x86)\n"; + tempstring = tempstring + " case \"$MD_SUBCPU\" in\n"; + tempstring = tempstring + " (x86)\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ -I./lib lib/volk_proccpu_sim.c lib/volk_cpu_x86.c lib/cpuid_x86.S 2>&1`\"\n"; + tempstring = tempstring + " then\n"; + tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; + tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; + tempstring = tempstring + " rm -f proccpu\n"; + tempstring = tempstring + " else\n"; + tempstring = tempstring + " AC_MSG_RESULT(no)\n"; + tempstring = tempstring + " lv_PROCCPU=no\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " ;;\n" + tempstring = tempstring + " (*)\n" + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ -I./lib lib/volk_proccpu_sim.c lib/volk_cpu_x86.c lib/cpuid_x86_64.S 2>&1`\"\n"; + tempstring = tempstring + " then\n"; + tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; + tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; + tempstring = tempstring + " rm -f proccpu\n"; + tempstring = tempstring + " else\n"; + tempstring = tempstring + " AC_MSG_RESULT(no)\n"; + tempstring = tempstring + " lv_PROCCPU=no\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " ;;\n" + tempstring = tempstring + " esac\n" + tempstring = tempstring + " ;;\n"; + tempstring = tempstring + " (powerpc)\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ lib/volk_proccpu_sim.c lib/volk_cpu_powerpc.c 2>&1`\"\n"; + tempstring = tempstring + " then\n"; + tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; + tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; + tempstring = tempstring + " rm -f proccpu\n"; + tempstring = tempstring + " else\n"; + tempstring = tempstring + " AC_MSG_RESULT(no)\n"; + tempstring = tempstring + " lv_PROCCPU=no\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " ;;\n"; + tempstring = tempstring + " (*)\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ lib/volk_proccpu_sim.c lib/volk_cpu_generic.c 2>&1`\"\n"; + tempstring = tempstring + " then\n"; + tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; + tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; + tempstring = tempstring + " rm -f proccpu\n"; + tempstring = tempstring + " else\n"; + tempstring = tempstring + " AC_MSG_RESULT(no)\n"; + tempstring = tempstring + " lv_PROCCPU=no\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " ;;\n"; + tempstring = tempstring + " esac\n"; + tempstring = tempstring + "])\n" + + for domarch in dom: + if str(domarch.attributes["type"].value) != "all": + arch = str(domarch.attributes["name"].value); + flag = domarch.getElementsByTagName("flag"); + flag = str(flag[0].firstChild.data); + tempstring = tempstring + "AC_DEFUN([_TRY_ADD_" + arch.swapcase() + "],\n"; + tempstring = tempstring + "[\n"; + tempstring = tempstring + " LF_CHECK_CC_FLAG([-" + flag + "])\n"; + tempstring = tempstring + " LF_CHECK_CXX_FLAG([-" + flag + "])\n"; + tempstring = tempstring + "])\n"; + + tempstring = tempstring + "AC_DEFUN([LV_SET_SIMD_FLAGS],\n"; + tempstring = tempstring + "[\n"; + tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n"; + tempstring = tempstring + " AC_SUBST(LV_CXXFLAGS)\n"; + tempstring = tempstring + " indCC=no\n"; + tempstring = tempstring + " indCXX=no\n"; + tempstring = tempstring + " indLV_ARCH=no\n"; + tempstring = tempstring + " AC_ARG_WITH(lv_arch,\n"; + tempstring = tempstring + " AC_HELP_STRING([--with-lv_arch=ARCH],[set volk hardware speedups as space separated string with elements from the following list("; + + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + arch + ", " + tempstring = tempstring[0:len(tempstring) - 2]; + + tempstring = tempstring + ")]),\n"; + tempstring = tempstring + " [cf_with_lv_arch=\"$withval\"],\n"; + tempstring = tempstring + " [cf_with_lv_arch=\"\"])\n"; + if str(domarch.attributes["type"].value) == "all": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; + tempstring = tempstring + " ADDONS=\"\"\n"; + tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; + tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; + tempstring = tempstring + " cf_with_lv_arch=$lv_PROCCPU\n"; + + tempstring = tempstring + " fi\n"; + tempstring = tempstring + " echo $cf_with_lv_arch\n"; + for domarch in dom: + if str(domarch.attributes["type"].value) != "all": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=no\n"; + + tempstring = tempstring + " case \"$MD_CPU\" in\n"; + tempstring = tempstring + " (x86)\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + atype = str(domarch.attributes["type"].value); + if atype == "x86": + tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n"; + + for domarch in dom: + arch = str(domarch.attributes["name"].value); + atype = str(domarch.attributes["type"].value); + flag = domarch.getElementsByTagName("flag"); + flag = str(flag[0].firstChild.data); + if atype == "x86": + tempstring = tempstring + " for i in $lf_CXXFLAGS\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; + tempstring = tempstring + " indCXX=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " for i in $lf_CFLAGS\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; + tempstring = tempstring + " indCC=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " indCC=no\n" + tempstring = tempstring + " indCXX=no\n" + tempstring = tempstring + " indLV_ARCH=no\n" + elif atype == "all": + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " ;;\n" + + tempstring = tempstring + " (powerpc)\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + atype = str(domarch.attributes["type"].value); + if atype == "powerpc": + tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n"; + + for domarch in dom: + arch = str(domarch.attributes["name"].value); + atype = str(domarch.attributes["type"].value); + flag = domarch.getElementsByTagName("flag"); + flag = str(flag[0].firstChild.data); + if atype == "powerpc": + tempstring = tempstring + " for i in $lf_CXXFLAGS\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; + tempstring = tempstring + " indCXX=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " for i in $lf_CFLAGS\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; + tempstring = tempstring + " indCC=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test \"$indCC\" = yes && test \"indCXX\" = yes && \"indLV_ARCH\" = yes; then\n" + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " indCC=no\n" + tempstring = tempstring + " indCXX=no\n" + tempstring = tempstring + " indLV_ARCH=no\n" + elif atype == "all": + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " ;;\n" + tempstring = tempstring + " esac\n" + tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" + tempstring = tempstring + "])\n" + + return tempstring; + + + diff --git a/volk/include/volk/make_typedefs.py b/volk/include/volk/make_typedefs.py new file mode 100644 index 000000000..fe81cb2b0 --- /dev/null +++ b/volk/include/volk/make_typedefs.py @@ -0,0 +1,23 @@ +from xml.dom import minidom +import string +from volk_regexp import * + + + +def make_typedefs(funclist, retlist, my_argtypelist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_TYPEDEFS'; + tempstring = tempstring + '\n#define INCLUDED_VOLK_TYPEDEFS\n'; + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + + tempstring = tempstring + '\n'; + + for i in range(len(funclist)): + tempstring = tempstring + "typedef " + retlist[i] +" (*" + replace_volk.sub("p", funclist[i]) + ")(" + my_argtypelist[i] + ");\n\n"; + + tempstring = tempstring + "#endif /*INCLUDED_VOLK_TYPEDEFS*/\n"; + + return tempstring; diff --git a/volk/include/volk/volk_16s_add_quad_aligned16.h b/volk/include/volk/volk_16s_add_quad_aligned16.h new file mode 100644 index 000000000..63042bef1 --- /dev/null +++ b/volk/include/volk/volk_16s_add_quad_aligned16.h @@ -0,0 +1,136 @@ +#ifndef INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H +#define INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H + + +#include +#include + + + + + +#if LV_HAVE_SSE2 +#include +#include + +static inline void volk_16s_add_quad_aligned16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; + p_target0 = (__m128i*)target0; + p_target1 = (__m128i*)target1; + p_target2 = (__m128i*)target2; + p_target3 = (__m128i*)target3; + + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + p_src4 = (__m128i*)src4; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + for(; i < bound; ++i) { + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(p_src1); + xmm2 = _mm_load_si128(p_src2); + xmm3 = _mm_load_si128(p_src3); + xmm4 = _mm_load_si128(p_src4); + + p_src0 += 1; + p_src1 += 1; + + xmm1 = _mm_add_epi16(xmm0, xmm1); + xmm2 = _mm_add_epi16(xmm0, xmm2); + xmm3 = _mm_add_epi16(xmm0, xmm3); + xmm4 = _mm_add_epi16(xmm0, xmm4); + + + p_src2 += 1; + p_src3 += 1; + p_src4 += 1; + + _mm_store_si128(p_target0, xmm1); + _mm_store_si128(p_target1, xmm2); + _mm_store_si128(p_target2, xmm3); + _mm_store_si128(p_target3, xmm4); + + p_target0 += 1; + p_target1 += 1; + p_target2 += 1; + p_target3 += 1; + } + /*asm volatile + ( + ".%=volk_16s_add_quad_aligned16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je .%=volk_16s_add_quad_aligned16_sse2_END\n\t" + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + "movaps (%[src4]), %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $16, %[src1]\n\t" + "add $16, %[src2]\n\t" + "add $16, %[src3]\n\t" + "add $16, %[src4]\n\t" + "paddw %%xmm1, %%xmm2\n\t" + "paddw %%xmm1, %%xmm3\n\t" + "paddw %%xmm1, %%xmm4\n\t" + "paddw %%xmm1, %%xmm5\n\t" + "add $-1, %[bound]\n\t" + "movaps %%xmm2, (%[target0])\n\t" + "movaps %%xmm3, (%[target1])\n\t" + "movaps %%xmm4, (%[target2])\n\t" + "movaps %%xmm5, (%[target3])\n\t" + "add $16, %[target0]\n\t" + "add $16, %[target1]\n\t" + "add $16, %[target2]\n\t" + "add $16, %[target3]\n\t" + "jmp .%=volk_16s_add_quad_aligned16_sse2_L1\n\t" + ".%=volk_16s_add_quad_aligned16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) + :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + + */ + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} +#endif /*LV_HAVE_SSE2*/ + + +#if LV_HAVE_GENERIC + +static inline void volk_16s_add_quad_aligned16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + + + + + +#endif /*INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h b/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h new file mode 100644 index 000000000..fb9d7cb87 --- /dev/null +++ b/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h @@ -0,0 +1,194 @@ +#ifndef INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H +#define INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H + + +#include +#include + + + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16s_branch_4_state_8_aligned16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + + __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; + + + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = 1; + + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + xmm0 = _mm_load_si128((__m128i*)permuters[0]); + xmm6 = _mm_load_si128((__m128i*)permuters[1]); + xmm8 = _mm_load_si128((__m128i*)permuters[2]); + xmm10 = _mm_load_si128((__m128i*)permuters[3]); + + for(; i < bound; ++i) { + + xmm5 = _mm_load_si128(p_src0); + + + + + + + + + + xmm0 = _mm_shuffle_epi8(xmm5, xmm0); + xmm6 = _mm_shuffle_epi8(xmm5, xmm6); + xmm8 = _mm_shuffle_epi8(xmm5, xmm8); + xmm10 = _mm_shuffle_epi8(xmm5, xmm10); + + p_src0 += 4; + + + xmm5 = _mm_add_epi16(xmm1, xmm2); + + xmm6 = _mm_add_epi16(xmm2, xmm6); + xmm8 = _mm_add_epi16(xmm1, xmm8); + + + xmm7 = _mm_load_si128(p_cntl2); + xmm9 = _mm_load_si128(p_cntl3); + + xmm0 = _mm_add_epi16(xmm5, xmm0); + + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm5 = _mm_load_si128(&p_cntl2[1]); + xmm11 = _mm_load_si128(&p_cntl3[1]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm7); + + + + xmm7 = _mm_load_si128(&p_cntl2[2]); + xmm9 = _mm_load_si128(&p_cntl3[2]); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm6 = _mm_add_epi16(xmm6, xmm5); + + + xmm5 = _mm_load_si128(&p_cntl2[3]); + xmm11 = _mm_load_si128(&p_cntl3[3]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm8 = _mm_add_epi16(xmm8, xmm7); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + _mm_store_si128(p_target, xmm0); + _mm_store_si128(&p_target[1], xmm6); + + xmm10 = _mm_add_epi16(xmm5, xmm10); + + _mm_store_si128(&p_target[2], xmm8); + + _mm_store_si128(&p_target[3], xmm10); + + p_target += 3; + } +} + + +#endif /*LV_HAVE_SSEs*/ + +#if LV_HAVE_GENERIC +static inline void volk_16s_branch_4_state_8_aligned16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + int i = 0; + + int bound = 4; + + for(; i < bound; ++i) { + target[i* 8] = src0[((char)permuters[i][0])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8] & scalars[2]) + + (cntl3[i * 8] & scalars[3]); + target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 1] & scalars[2]) + + (cntl3[i * 8 + 1] & scalars[3]); + target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 2] & scalars[2]) + + (cntl3[i * 8 + 2] & scalars[3]); + target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 3] & scalars[2]) + + (cntl3[i * 8 + 3] & scalars[3]); + target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 4] & scalars[2]) + + (cntl3[i * 8 + 4] & scalars[3]); + target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 5] & scalars[2]) + + (cntl3[i * 8 + 5] & scalars[3]); + target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 6] & scalars[2]) + + (cntl3[i * 8 + 6] & scalars[3]); + target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 7] & scalars[2]) + + (cntl3[i * 8 + 7] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_convert_32f_aligned16.h b/volk/include/volk/volk_16s_convert_32f_aligned16.h new file mode 100644 index 000000000..126ce1528 --- /dev/null +++ b/volk/include/volk/volk_16s_convert_32f_aligned16.h @@ -0,0 +1,119 @@ +#ifndef INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H +#define INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16s_convert_32f_aligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16s_convert_32f_aligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16s_convert_32f_aligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_convert_32f_unaligned16.h b/volk/include/volk/volk_16s_convert_32f_unaligned16.h new file mode 100644 index 000000000..d6212fba5 --- /dev/null +++ b/volk/include/volk/volk_16s_convert_32f_unaligned16.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H +#define INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16s_convert_32f_unaligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16s_convert_32f_unaligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16s_convert_32f_unaligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_aligned16.h b/volk/include/volk/volk_16s_convert_8s_aligned16.h new file mode 100644 index 000000000..64c368688 --- /dev/null +++ b/volk/include/volk/volk_16s_convert_8s_aligned16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H +#define INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16s_convert_8s_aligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16s_convert_8s_aligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_unaligned16.h b/volk/include/volk/volk_16s_convert_8s_unaligned16.h new file mode 100644 index 000000000..ca925de86 --- /dev/null +++ b/volk/include/volk/volk_16s_convert_8s_unaligned16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H +#define INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned +*/ +static inline void volk_16s_convert_8s_unaligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned +*/ +static inline void volk_16s_convert_8s_unaligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_max_star_aligned16.h b/volk/include/volk/volk_16s_max_star_aligned16.h new file mode 100644 index 000000000..ba4e979ec --- /dev/null +++ b/volk/include/volk/volk_16s_max_star_aligned16.h @@ -0,0 +1,108 @@ +#ifndef INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H +#define INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H + + +#include +#include + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16s_max_star_aligned16_ssse3(short* target, short* src0, unsigned int num_bytes) { + + + + short candidate = src0[0]; + short cands[8]; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + + + __m128i *p_src0; + + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + xmm1 = _mm_setzero_si128(); + xmm0 = _mm_setzero_si128(); + //_mm_insert_epi16(xmm0, candidate, 0); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm1); + + + for(i = 0; i < bound; ++i) { + xmm1 = _mm_load_si128(p_src0); + p_src0 += 1; + xmm2 = _mm_sub_epi16(xmm1, xmm0); + + + + + + + xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); + xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); + xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); + + xmm6 = _mm_xor_si128(xmm4, xmm5); + + xmm3 = _mm_and_si128(xmm3, xmm0); + xmm4 = _mm_and_si128(xmm6, xmm1); + + xmm0 = _mm_add_epi16(xmm3, xmm4); + + + } + + _mm_store_si128((__m128i*)cands, xmm0); + + for(i = 0; i < 8; ++i) { + candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; + } + + + + for(i = 0; i < leftovers; ++i) { + + candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; + } + + target[0] = candidate; + + + + + +} + +#endif /*LV_HAVE_SSSE3*/ + +#if LV_HAVE_GENERIC + +static inline void volk_16s_max_star_aligned16_generic(short* target, short* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short candidate = src0[0]; + for(i = 1; i < bound; ++i) { + candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; + } + target[0] = candidate; + +} + + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h b/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h new file mode 100644 index 000000000..82d011677 --- /dev/null +++ b/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h @@ -0,0 +1,130 @@ +#ifndef INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H +#define INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H + + +#include +#include + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16s_max_star_horizontal_aligned16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; + const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; + + + + volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i xmm5, xmm6, xmm7, xmm8; + + xmm4 = _mm_load_si128((__m128i*)shufmask0); + xmm5 = _mm_load_si128((__m128i*)shufmask1); + xmm6 = _mm_load_si128((__m128i*)andmask0); + xmm7 = _mm_load_si128((__m128i*)andmask1); + + __m128i *p_target, *p_src0; + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 5; + int intermediate = (num_bytes >> 4) & 1; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + for(i = 0; i < bound; ++i) { + + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(&p_src0[1]); + + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 2; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + xmm3 = _mm_and_si128(xmm2, xmm7); + + + xmm8 = _mm_add_epi8(xmm8, xmm4); + xmm3 = _mm_add_epi8(xmm3, xmm5); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm8); + xmm1 = _mm_shuffle_epi8(xmm1, xmm3); + + + xmm3 = _mm_add_epi16(xmm0, xmm1); + + + _mm_store_si128(p_target, xmm3); + + p_target += 1; + + } + + for(i = 0; i < intermediate; ++i) { + + xmm0 = _mm_load_si128(p_src0); + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 1; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + + xmm3 = _mm_add_epi8(xmm8, xmm4); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm3); + + + _mm_storel_pd((double*)p_target, (__m128d)xmm0); + + p_target = (__m128i*)((int8_t*)p_target + 8); + + } + + for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { + target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; + } + + +} + +#endif /*LV_HAVE_SSSE3*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16s_max_star_horizontal_aligned16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + + for(i = 0; i < bound; i += 2) { + target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; + } + +} + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h new file mode 100644 index 000000000..452d05c4f --- /dev/null +++ b/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h @@ -0,0 +1,139 @@ +#ifndef INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H +#define INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H + + +#include +#include + + + + +#if LV_HAVE_SSE2 + +#include +#include + +static inline void volk_16s_permute_and_scalar_add_aligned16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; + + short* p_permute_indexes = permute_indexes; + + p_target = (__m128i*)target; + p_cntl0 = (__m128i*)cntl0; + p_cntl1 = (__m128i*)cntl1; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + + for(; i < bound; ++i) { + xmm0 = _mm_setzero_si128(); + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = _mm_setzero_si128(); + + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_permute_indexes += 8; + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + xmm5 = _mm_load_si128(p_cntl0); + xmm6 = _mm_load_si128(p_cntl1); + xmm7 = _mm_load_si128(p_cntl2); + + xmm5 = _mm_and_si128(xmm5, xmm1); + xmm6 = _mm_and_si128(xmm6, xmm2); + xmm7 = _mm_and_si128(xmm7, xmm3); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + xmm5 = _mm_load_si128(p_cntl3); + + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_cntl0 += 1; + + xmm5 = _mm_and_si128(xmm5, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + p_cntl1 += 1; + p_cntl2 += 1; + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + p_cntl3 += 1; + + _mm_store_si128(p_target, xmm0); + + p_target += 1; + } + + + + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + } +} +#endif /*LV_HAVE_SSEs*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16s_permute_and_scalar_add_aligned16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_quad_max_star_aligned16.h b/volk/include/volk/volk_16s_quad_max_star_aligned16.h new file mode 100644 index 000000000..1004c4d23 --- /dev/null +++ b/volk/include/volk/volk_16s_quad_max_star_aligned16.h @@ -0,0 +1,191 @@ +#ifndef INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H +#define INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H + + +#include +#include + + + + + +#if LV_HAVE_SSE2 + +#include + +static inline void volk_16s_quad_max_star_aligned16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + + + + int i = 0; + + int bound = (num_bytes >> 4); + int bound_copy = bound; + int leftovers = (num_bytes >> 1) & 7; + + __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; + p_target = (__m128i*) target; + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + + + + __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + while(bound_copy > 0) { + + xmm1 = _mm_load_si128(p_src0); + xmm2 = _mm_load_si128(p_src1); + xmm3 = _mm_load_si128(p_src2); + xmm4 = _mm_load_si128(p_src3); + + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = xmm1; + xmm8 = xmm3; + + + xmm1 = _mm_sub_epi16(xmm2, xmm1); + + + + xmm3 = _mm_sub_epi16(xmm4, xmm3); + + xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); + xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); + + + + xmm2 = _mm_and_si128(xmm5, xmm2); + xmm4 = _mm_and_si128(xmm6, xmm4); + xmm5 = _mm_andnot_si128(xmm5, xmm7); + xmm6 = _mm_andnot_si128(xmm6, xmm8); + + xmm5 = _mm_add_epi16(xmm2, xmm5); + xmm6 = _mm_add_epi16(xmm4, xmm6); + + + xmm1 = _mm_xor_si128(xmm1, xmm1); + xmm2 = xmm5; + xmm5 = _mm_sub_epi16(xmm6, xmm5); + p_src0 += 1; + bound_copy -= 1; + + xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); + p_src1 += 1; + + xmm6 = _mm_and_si128(xmm1, xmm6); + + xmm1 = _mm_andnot_si128(xmm1, xmm2); + p_src2 += 1; + + + + xmm1 = _mm_add_epi16(xmm6, xmm1); + p_src3 += 1; + + + _mm_store_si128(p_target, xmm1); + p_target += 1; + + } + + + /*asm volatile + ( + "volk_16s_quad_max_star_aligned16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je volk_16s_quad_max_star_aligned16_sse2_END\n\t" + + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "movaps %%xmm1, %%xmm7\n\t" + "movaps %%xmm3, %%xmm8\n\t" + "psubw %%xmm2, %%xmm1\n\t" + "psubw %%xmm4, %%xmm3\n\t" + + "pcmpgtw %%xmm1, %%xmm5\n\t" + "pcmpgtw %%xmm3, %%xmm6\n\t" + + "pand %%xmm5, %%xmm2\n\t" + "pand %%xmm6, %%xmm4\n\t" + "pandn %%xmm7, %%xmm5\n\t" + "pandn %%xmm8, %%xmm6\n\t" + + "paddw %%xmm2, %%xmm5\n\t" + "paddw %%xmm4, %%xmm6\n\t" + + "pxor %%xmm1, %%xmm1\n\t" + "movaps %%xmm5, %%xmm2\n\t" + + "psubw %%xmm6, %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $-1, %[bound]\n\t" + + "pcmpgtw %%xmm5, %%xmm1\n\t" + "add $16, %[src1]\n\t" + + "pand %%xmm1, %%xmm6\n\t" + + "pandn %%xmm2, %%xmm1\n\t" + "add $16, %[src2]\n\t" + + "paddw %%xmm6, %%xmm1\n\t" + "add $16, %[src3]\n\t" + + "movaps %%xmm1, (%[target])\n\t" + "addw $16, %[target]\n\t" + "jmp volk_16s_quad_max_star_aligned16_sse2_L1\n\t" + + "volk_16s_quad_max_star_aligned16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) + : + ); + */ + + short temp0 = 0; + short temp1 = 0; + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } + return; + + +} + +#endif /*LV_HAVE_SSE2*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16s_quad_max_star_aligned16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short temp0 = 0; + short temp1 = 0; + for(i = 0; i < bound; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } +} + + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h new file mode 100644 index 000000000..32e13df98 --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h @@ -0,0 +1,146 @@ +#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H +#define INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_16s_aligned16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); + __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); + qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = *int16ComplexVectorPtr++; + *qBufferPtr++ = *int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_16s_aligned16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); + + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..86f67437d --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h @@ -0,0 +1,95 @@ +#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + uint64_t number = 0; + const uint64_t quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..b594c85b8 --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H +#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_16s_aligned16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + iOutputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_16s_aligned16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i complexVal1, complexVal2, iOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..3e7be1e64 --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,125 @@ +#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H +#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + + for(;number < quarterPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi16_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; + sixteenTComplexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar = 1.0/scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h new file mode 100644 index 000000000..c0d1e941a --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h @@ -0,0 +1,83 @@ +#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H +#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + complexVal1 = _mm_or_si128(complexVal1, complexVal2); + + complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); + complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); + + complexVal3 = _mm_or_si128(complexVal3, complexVal4); + + + complexVal1 = _mm_srai_epi16(complexVal1, 8); + complexVal3 = _mm_srai_epi16(complexVal3, 8); + + iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); + int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h new file mode 100644 index 000000000..1482ab82e --- /dev/null +++ b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h @@ -0,0 +1,179 @@ +#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H +#define INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + float inputFloatBuffer[4] __attribute__((aligned(128))); + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue1 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue2 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float scalar = 32786.0; + for(number = 0; number < num_points; number++){ + float real = ((float)(*complexVectorPtr++)) / scalar; + float imag = ((float)(*complexVectorPtr++)) / scalar; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h new file mode 100644 index 000000000..29c58ceb8 --- /dev/null +++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h @@ -0,0 +1,163 @@ +#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H +#define INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) / scalar; + float val1Imag = (float)(*complexVectorPtr++) / scalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) * iScalar; + float val1Imag = (float)(*complexVectorPtr++) * iScalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + float real = ( (float) (*complexVectorPtr++)) * invScalar; + float imag = ( (float) (*complexVectorPtr++)) * invScalar; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h new file mode 100644 index 000000000..698e958e4 --- /dev/null +++ b/volk/include/volk/volk_16u_byteswap_aligned16.h @@ -0,0 +1,65 @@ +#ifndef INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H +#define INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_aligned16_sse2(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the 16t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + + // Byteswap any remaining points: + number = eighthPoints*8; + for(; number < num_points; number++){ + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for(point = 0; point < num_points; point++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_accumulator_aligned16.h b/volk/include/volk/volk_32f_accumulator_aligned16.h new file mode 100644 index 000000000..7e395cf50 --- /dev/null +++ b/volk/include/volk/volk_32f_accumulator_aligned16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H +#define INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_32f_accumulator_aligned16_sse(float* result, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + float tempBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + aVal = _mm_load_ps(aPtr); + accumulator = _mm_add_ps(accumulator, aVal); + aPtr += 4; + } + _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_32f_accumulator_aligned16_generic(float* result, const float* inputBuffer, unsigned int num_points){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + float returnValue = 0; + + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_add_aligned16.h b/volk/include/volk/volk_32f_add_aligned16.h new file mode 100644 index 000000000..721c60fd6 --- /dev/null +++ b/volk/include/volk/volk_32f_add_aligned16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_VOLK_32f_ADD_ALIGNED16_H +#define INCLUDED_VOLK_32f_ADD_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_add_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_add_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_ADD_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h new file mode 100644 index 000000000..ff917525f --- /dev/null +++ b/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h @@ -0,0 +1,167 @@ +#ifndef INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H +#define INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_calc_spectral_noise_floor_aligned16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* dataPointsPtr = realDataPoints; + float avgPointsVector[4] __attribute__((aligned(128))); + + __m128 dataPointsVal; + __m128 avgPointsVal = _mm_setzero_ps(); + // Calculate the sum (for mean) for all points + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); + } + + _mm_store_ps(avgPointsVector, avgPointsVal); + + float sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more + const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; + + dataPointsPtr = realDataPoints; // Reset the dataPointsPtr + __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); + __m128 vOnesVector = _mm_set_ps1(1.0); + __m128 vValidBinCount = _mm_setzero_ps(); + avgPointsVal = _mm_setzero_ps(); + __m128 compareMask; + number = 0; + // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + // Identify which items do not exceed the mean amplitude + compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); + + // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude + avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); + + // Count the number of bins which do not exceed the mean amplitude + vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); + } + + // Calculate the mean from the remaining data points + _mm_store_ps(avgPointsVector, avgPointsVal); + + sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + // Calculate the number of valid bins from the remaning count + float validBinCountVector[4] __attribute__((aligned(128))); + _mm_store_ps(validBinCountVector, vValidBinCount); + + float validBinCount = 0; + validBinCount += validBinCountVector[0]; + validBinCount += validBinCountVector[1]; + validBinCount += validBinCountVector[2]; + validBinCount += validBinCountVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(realDataPoints[number] <= meanAmplitude){ + sumMean += realDataPoints[number]; + validBinCount += 1.0; + } + } + + float localNoiseFloorAmplitude = 0; + if(validBinCount > 0.0){ + localNoiseFloorAmplitude = sumMean / validBinCount; + } + else{ + localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... + } + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_calc_spectral_noise_floor_aligned16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + float sumMean = 0.0; + unsigned int number; + // find the sum (for mean), etc + for(number = 0; number < num_points; number++){ + // sum (for mean) + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more) + const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; + + // now throw out any bins higher than the mean + sumMean = 0.0; + unsigned int newNumDataPoints = num_points; + for(number = 0; number < num_points; number++){ + if (realDataPoints[number] <= meanAmplitude) + sumMean += realDataPoints[number]; + else + newNumDataPoints--; + } + + float localNoiseFloorAmplitude = 0.0; + if (newNumDataPoints == 0) // in the odd case that all + localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! + else + localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_16s_aligned16.h b/volk/include/volk/volk_32f_convert_16s_aligned16.h new file mode 100644 index 000000000..7fbabd9c3 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_16s_aligned16.h @@ -0,0 +1,110 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_16s_aligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_16s_aligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_16s_aligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_16s_unaligned16.h b/volk/include/volk/volk_32f_convert_16s_unaligned16.h new file mode 100644 index 000000000..d2bbdf13a --- /dev/null +++ b/volk/include/volk/volk_32f_convert_16s_unaligned16.h @@ -0,0 +1,113 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_16s_unaligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_16s_unaligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_16s_unaligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_32s_aligned16.h b/volk/include/volk/volk_32f_convert_32s_aligned16.h new file mode 100644 index 000000000..011ef5d0e --- /dev/null +++ b/volk/include/volk/volk_32f_convert_32s_aligned16.h @@ -0,0 +1,106 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_32s_aligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_32s_aligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_32s_aligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_32s_unaligned16.h b/volk/include/volk/volk_32f_convert_32s_unaligned16.h new file mode 100644 index 000000000..a6df826c7 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_32s_unaligned16.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_32s_unaligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_32s_unaligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_32s_unaligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_aligned16.h b/volk/include/volk/volk_32f_convert_64f_aligned16.h new file mode 100644 index 000000000..91a855813 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_64f_aligned16.h @@ -0,0 +1,70 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_aligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_aligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_unaligned16.h b/volk/include/volk/volk_32f_convert_64f_unaligned16.h new file mode 100644 index 000000000..698e0d446 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_64f_unaligned16.h @@ -0,0 +1,70 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_unaligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_unaligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_8s_aligned16.h b/volk/include/volk/volk_32f_convert_8s_aligned16.h new file mode 100644 index 000000000..b9487b622 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_8s_aligned16.h @@ -0,0 +1,117 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_8s_aligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_8s_aligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_8s_aligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_8s_unaligned16.h b/volk/include/volk/volk_32f_convert_8s_unaligned16.h new file mode 100644 index 000000000..e986dbc87 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_8s_unaligned16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_8s_unaligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_8s_unaligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_8s_unaligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_divide_aligned16.h b/volk/include/volk/volk_32f_divide_aligned16.h new file mode 100644 index 000000000..c00700cd8 --- /dev/null +++ b/volk/include/volk/volk_32f_divide_aligned16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H +#define INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_divide_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_div_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_divide_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_dot_prod_aligned16.h b/volk/include/volk/volk_32f_dot_prod_aligned16.h new file mode 100644 index 000000000..3aee1136a --- /dev/null +++ b/volk/include/volk/volk_32f_dot_prod_aligned16.h @@ -0,0 +1,184 @@ +#ifndef INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H +#define INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32f_dot_prod_aligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE + + +static inline void volk_32f_dot_prod_aligned16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32f_dot_prod_aligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_dot_prod_aligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + + bVal1 = _mm_load_ps(bPtr); bPtr += 4; + bVal2 = _mm_load_ps(bPtr); bPtr += 4; + bVal3 = _mm_load_ps(bPtr); bPtr += 4; + bVal4 = _mm_load_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + float dotProductVector[4] __attribute__((aligned(16))); + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32f_dot_prod_unaligned16.h b/volk/include/volk/volk_32f_dot_prod_unaligned16.h new file mode 100644 index 000000000..bce6aa15f --- /dev/null +++ b/volk/include/volk/volk_32f_dot_prod_unaligned16.h @@ -0,0 +1,184 @@ +#ifndef INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H +#define INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32f_dot_prod_unaligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE + + +static inline void volk_32f_dot_prod_unaligned16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32f_dot_prod_unaligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_dot_prod_unaligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; + + bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + float dotProductVector[4] __attribute__((aligned(16))); + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H*/ diff --git a/volk/include/volk/volk_32f_fm_detect_aligned16.h b/volk/include/volk/volk_32f_fm_detect_aligned16.h new file mode 100644 index 000000000..c82239d74 --- /dev/null +++ b/volk/include/volk/volk_32f_fm_detect_aligned16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H +#define INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) + \param bound The interval that the input phase data is in, which is used to modulo the differentiation + \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. + \param num_noints The number of real values in the input vector. +*/ +static inline void volk_32f_fm_detect_aligned16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ + if (num_points < 1) { + return; + } + unsigned int number = 1; + unsigned int j = 0; + // num_points-1 keeps Fedora 7's gcc from crashing... + // num_points won't work. :( + const unsigned int quarterPoints = (num_points-1) / 4; + + float* outPtr = outputVector; + const float* inPtr = inputVector; + __m128 upperBound = _mm_set_ps1(bound); + __m128 lowerBound = _mm_set_ps1(-bound); + __m128 next3old1; + __m128 next4; + __m128 boundAdjust; + __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. + __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. + // Do the first 4 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + for (; number < quarterPoints; number++) { + // Load data + next3old1 = _mm_loadu_ps((float*) (inPtr-1)); + next4 = _mm_load_ps(inPtr); + inPtr += 4; + // Subtract and store: + next3old1 = _mm_sub_ps(next4, next3old1); + // Bound: + boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); + boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); + next4 = _mm_cmplt_ps(next3old1, lowerBound); + next4 = _mm_and_ps(next4, negBoundAdjust); + boundAdjust = _mm_or_ps(next4, boundAdjust); + // Make sure we're in the bounding interval: + next3old1 = _mm_add_ps(next3old1, boundAdjust); + _mm_store_ps(outPtr,next3old1); // Store the results back into the output + outPtr += 4; + } + + for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points-1]; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) + \param bound The interval that the input phase data is in, which is used to modulo the differentiation + \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. + \param num_points The number of real values in the input vector. +*/ +static inline void volk_32f_fm_detect_aligned16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ + if (num_points < 1) { + return; + } + unsigned int number = 0; + float* outPtr = outputVector; + const float* inPtr = inputVector; + + // Do the first 1 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + + for (number = 1; number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points-1]; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_index_max_aligned16.h b/volk/include/volk/volk_32f_index_max_aligned16.h new file mode 100644 index 000000000..26322bfa2 --- /dev/null +++ b/volk/include/volk/volk_32f_index_max_aligned16.h @@ -0,0 +1,148 @@ +#ifndef INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H +#define INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include + +static inline void volk_32f_index_max_aligned16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + float maxValuesBuffer[4] __attribute__((aligned(16))); + float maxIndexesBuffer[4] __attribute__((aligned(16))); + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); + maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + +#if LV_HAVE_SSE +#include + +static inline void volk_32f_index_max_aligned16_sse(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + float maxValuesBuffer[4] __attribute__((aligned(16))); + float maxIndexesBuffer[4] __attribute__((aligned(16))); + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); + + maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_GENERIC +static inline void volk_32f_index_max_aligned16_generic(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + float max = src0[0]; + unsigned int index = 0; + + int i = 1; + + for(; i < num_points; ++i) { + + if(src0[i] > max){ + index = i; + max = src0[i]; + } + + } + target[0] = index; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32f_interleave_16sc_aligned16.h b/volk/include/volk/volk_32f_interleave_16sc_aligned16.h new file mode 100644 index 000000000..476946b88 --- /dev/null +++ b/volk/include/volk/volk_32f_interleave_16sc_aligned16.h @@ -0,0 +1,155 @@ +#ifndef INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H +#define INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_interleave_16sc_aligned16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue1, cplxValue2; + __m128i intValue1, intValue2; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); + + intValue1 = _mm_cvtps_epi32(cplxValue1); + intValue2 = _mm_cvtps_epi32(cplxValue2); + + intValue1 = _mm_packs_epi32(intValue1, intValue2); + + _mm_store_si128((__m128i*)complexVectorPtr, intValue1); + complexVectorPtr += 8; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_interleave_16sc_aligned16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_interleave_16sc_aligned16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + int16_t* complexVectorPtr = (int16_t*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_interleave_32fc_aligned16.h b/volk/include/volk/volk_32f_interleave_32fc_aligned16.h new file mode 100644 index 000000000..859c6a0ef --- /dev/null +++ b/volk/include/volk/volk_32f_interleave_32fc_aligned16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H +#define INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Interleaves the I & Q vector data into the complex vector + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_interleave_32fc_aligned16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + unsigned int number = 0; + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + const uint64_t quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Interleaves the I & Q vector data into the complex vector. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_interleave_32fc_aligned16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_max_aligned16.h b/volk/include/volk/volk_32f_max_aligned16.h new file mode 100644 index 000000000..96aafb2bf --- /dev/null +++ b/volk/include/volk/volk_32f_max_aligned16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_VOLK_32f_MAX_ALIGNED16_H +#define INCLUDED_VOLK_32f_MAX_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_max_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_max_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_max_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_VOLK_32f_MAX_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_min_aligned16.h b/volk/include/volk/volk_32f_min_aligned16.h new file mode 100644 index 000000000..e247f4213 --- /dev/null +++ b/volk/include/volk/volk_32f_min_aligned16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_VOLK_32f_MIN_ALIGNED16_H +#define INCLUDED_VOLK_32f_MIN_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_min_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_min_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_min_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_VOLK_32f_MIN_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_multiply_aligned16.h b/volk/include/volk/volk_32f_multiply_aligned16.h new file mode 100644 index 000000000..b557580ab --- /dev/null +++ b/volk/include/volk/volk_32f_multiply_aligned16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H +#define INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_multiply_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_multiply_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h new file mode 100644 index 000000000..1aabb1d9d --- /dev/null +++ b/volk/include/volk/volk_32f_normalize_aligned16.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H +#define INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) + \param vecBuffer The buffer of values to be vectorized + \param num_points The number of values in vecBuffer + \param scalar The scale value to be applied to each buffer value +*/ +static inline void volk_32f_normalize_aligned16_sse(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + + const float invScalar = 1.0 / scalar; + __m128 vecScalar = _mm_set_ps1(invScalar); + + __m128 input1; + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + + input1 = _mm_load_ps(inputPtr); + + input1 = _mm_mul_ps(input1, vecScalar); + + _mm_store_ps(inputPtr, input1); + + inputPtr += 4; + } + + number = quarterPoints*4; + for(; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Normalizes the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be normalizeed + \param bVector One of the vectors to be normalizeed + \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector +*/ +static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_power_aligned16.h b/volk/include/volk/volk_32f_power_aligned16.h new file mode 100644 index 000000000..2ecd8eecb --- /dev/null +++ b/volk/include/volk/volk_32f_power_aligned16.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_VOLK_32f_POWER_ALIGNED16_H +#define INCLUDED_VOLK_32f_POWER_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32f_power_aligned16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_blendv_ps(aVal, negatedValues, signMask); + + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32f_power_aligned16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); + + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32f_power_aligned16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_POWER_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_sqrt_aligned16.h b/volk/include/volk/volk_32f_sqrt_aligned16.h new file mode 100644 index 000000000..0b2eaf251 --- /dev/null +++ b/volk/include/volk/volk_32f_sqrt_aligned16.h @@ -0,0 +1,64 @@ +#ifndef INCLUDED_VOLK_32f_SQRT_ALIGNED16_H +#define INCLUDED_VOLK_32f_SQRT_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_aligned16_sse(float* cVector, const float* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_sqrt_ps(aVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_aligned16_generic(float* cVector, const float* aVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_SQRT_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_stddev_aligned16.h b/volk/include/volk/volk_32f_stddev_aligned16.h new file mode 100644 index 000000000..1c6a08437 --- /dev/null +++ b/volk/include/volk/volk_32f_stddev_aligned16.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H +#define INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_stddev_aligned16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_stddev_aligned16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_stddev_aligned16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h new file mode 100644 index 000000000..1cd502257 --- /dev/null +++ b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h @@ -0,0 +1,169 @@ +#ifndef INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H +#define INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_aligned16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + float meanBuffer[4] __attribute__((aligned(128))); + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_aligned16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + float meanBuffer[4] __attribute__((aligned(128))); + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_aligned16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_subtract_aligned16.h b/volk/include/volk/volk_32f_subtract_aligned16.h new file mode 100644 index 000000000..ac3f5e5d1 --- /dev/null +++ b/volk/include/volk/volk_32f_subtract_aligned16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H +#define INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_subtract_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_sub_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_subtract_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_sum_of_poly_aligned16.h b/volk/include/volk/volk_32f_sum_of_poly_aligned16.h new file mode 100644 index 000000000..a326e62b1 --- /dev/null +++ b/volk/include/volk/volk_32f_sum_of_poly_aligned16.h @@ -0,0 +1,151 @@ +#ifndef INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H +#define INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H + +#include +#include +#include + +#ifndef MAX +#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +#endif + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32f_sum_of_poly_aligned16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; + + xmm9 = _mm_setzero_ps(); + xmm1 = _mm_setzero_ps(); + + xmm0 = _mm_load1_ps(¢er_point_array[0]); + xmm6 = _mm_load1_ps(¢er_point_array[1]); + xmm7 = _mm_load1_ps(¢er_point_array[2]); + xmm8 = _mm_load1_ps(¢er_point_array[3]); + //xmm11 = _mm_load1_ps(¢er_point_array[4]); + xmm10 = _mm_load1_ps(cutoff); + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 2) & 3; + int i = 0; + + for(; i < bound; ++i) { + xmm2 = _mm_load_ps(src0); + xmm2 = _mm_max_ps(xmm10, xmm2); + xmm3 = _mm_mul_ps(xmm2, xmm2); + xmm4 = _mm_mul_ps(xmm2, xmm3); + xmm5 = _mm_mul_ps(xmm3, xmm3); + //xmm12 = _mm_mul_ps(xmm3, xmm4); + + xmm2 = _mm_mul_ps(xmm2, xmm0); + xmm3 = _mm_mul_ps(xmm3, xmm6); + xmm4 = _mm_mul_ps(xmm4, xmm7); + xmm5 = _mm_mul_ps(xmm5, xmm8); + //xmm12 = _mm_mul_ps(xmm12, xmm11); + + xmm2 = _mm_add_ps(xmm2, xmm3); + xmm3 = _mm_add_ps(xmm4, xmm5); + + src0 += 4; + + xmm9 = _mm_add_ps(xmm2, xmm9); + + xmm1 = _mm_add_ps(xmm3, xmm1); + + //xmm9 = _mm_add_ps(xmm12, xmm9); + } + + xmm2 = _mm_hadd_ps(xmm9, xmm1); + xmm3 = _mm_hadd_ps(xmm2, xmm2); + xmm4 = _mm_hadd_ps(xmm3, xmm3); + + _mm_store_ss(&result, xmm4); + + + + for(i = 0; i < leftovers; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth);// + + //center_point_array[4] * fith); + } + + result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; + + target[0] = result; +} + + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC + +static inline void volk_32f_sum_of_poly_aligned16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + int i = 0; + + for(; i < num_bytes >> 2; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth); //+ + //center_point_array[4] * fith); + /*printf("%f12...%d\n", (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth) + + //center_point_array[4] * fith) + + (center_point_array[4]), i); + */ + } + + result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); + + + + *target = result; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h new file mode 100644 index 000000000..436656ca0 --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h @@ -0,0 +1,82 @@ +#ifndef INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H +#define INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies the input complex vector with the input float vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the float values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; + for(;number < quarterPoints; number++){ + + aVal1 = _mm_load_ps((const float*)aPtr); + aPtr += 2; + + aVal2 = _mm_load_ps((const float*)aPtr); + aPtr += 2; + + bVal = _mm_load_ps(bPtr); + bPtr += 4; + + bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); + bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); + + cVal = _mm_mul_ps(aVal1, bVal1); + + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + cPtr += 2; + + cVal = _mm_mul_ps(aVal2, bVal2); + + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr); + bPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h b/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h new file mode 100644 index 000000000..2d71ee4f8 --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H +#define INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32fc_32f_power_32fc_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + + __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + cplxValue2 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + // Convert to polar coordinates + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + phase = atan2f4(qValue, iValue); // Calculate the Phase + + magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values + + // Now calculate the power of the polar coordinate data + magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power + + phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power + + // Convert back to cartesian coordinates + iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude + qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude + + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values + + _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container + + cPtr += 2; + + _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + lv_32fc_t complexPower; + ((float*)&complexPower)[0] = power; + ((float*)&complexPower)[1] = 0; + for(;number < num_points; number++){ + *cPtr++ = lv_cpow((*aPtr++), complexPower); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32fc_32f_power_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + lv_32fc_t complexPower; + ((float*)&complexPower)[0] = power; + ((float*)&complexPower)[1] = 0.0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_cpow((*aPtr++), complexPower); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_atan2_32f_aligned16.h b/volk/include/volk/volk_32fc_atan2_32f_aligned16.h new file mode 100644 index 000000000..df0ebb987 --- /dev/null +++ b/volk/include/volk/volk_32fc_atan2_32f_aligned16.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H +#define INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_atan2_32f_aligned16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float invNormalizeFactor = 1.0 / normalizeFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 testVector = _mm_set_ps1(2*M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase,testVector); + phase = _mm_blendv_ps(correctVector, phase, keepMask); + // done with above correction. + phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; +#endif /* LV_HAVE_SIMDMATH_H */ + + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_atan2_32f_aligned16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float invNormalizeFactor = 1.0 / normalizeFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 testVector = _mm_set_ps1(2*M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 mask; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase,testVector); + phase = _mm_and_ps(phase, keepMask); + mask = _mm_andnot_ps(keepMask, correctVector); + phase = _mm_or_ps(phase, mask); + // done with above correction. + phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; +#endif /* LV_HAVE_SIMDMATH_H */ + + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The vector where the results will be stored. + \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_atan2_32f_aligned16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ + float* outPtr = outputVector; + const float* inPtr = (float*)inputVector; + const float invNormalizeFactor = 1.0 / normalizeFactor; + unsigned int number; + for ( number = 0; number < num_points; number++) { + const float real = *inPtr++; + const float imag = *inPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h new file mode 100644 index 000000000..60103c1b5 --- /dev/null +++ b/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h @@ -0,0 +1,344 @@ +#ifndef INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H +#define INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H + +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_conjugate_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_conjugate_dot_prod_aligned16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + + + + asm volatile + ( + "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %[conjugator], %%r9\n\t" + " movq %%rcx, %%rax\n\t" + " movaps 0(%%r9), %%xmm8\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movups 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " xorps %%xmm8, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " xorps %%xmm8, %%xmm3\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + " xorps %%xmm8, %%xmm2\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); + + } + + return; +} +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 +static inline void volk_32fc_conjugate_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + int bound = num_bytes >> 4; + int leftovers = num_bytes % 16; + + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " #movl 12(%%ebp), %%eax # input\n\t" + " #movl 16(%%ebp), %%edx # taps\n\t" + " #movl 20(%%ebp), %%ecx # n_bytes\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%[eax]), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%[edx]), %%xmm2\n\t" + " movl %[ecx], (%[out])\n\t" + " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" + + " xorps %%xmm1, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%[eax]), %%xmmA\n\t" + "# movaps (%[edx]), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%[edx]), %%xmm3\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " xorps %%xmm1, %%xmm3\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " movaps 16(%[eax]), %%xmm1\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%[eax]), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %[eax]\n\t" + " movaps 32(%[edx]), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " xorps %%xmm1, %%xmm2\n\t" + " addl $32, %[edx]\n\t" + ".%=L1_test:\n\t" + " decl %[ecx]\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" + " shrl $4, %[ecx]\n\t" + " andl $1, %[ecx]\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " #movl 8(%%ebp), %[eax] \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%[out])\n\t" + " movss (%[out]), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %[eax] # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) + ); + + + + + printf("%d, %d\n", leftovers, bound); + + for(; leftovers > 0; leftovers -= 8) { + + + *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + + + +#endif /*INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..02085cd1e --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h new file mode 100644 index 000000000..3d9ebccdd --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h @@ -0,0 +1,78 @@ +#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H +#define INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_aligned16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_aligned16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..3026b2422 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,80 @@ +#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H +#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_16s_aligned16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + iValue = _mm_mul_ps(iValue, vScalar); + + _mm_store_ps(floatBuffer, iValue); + *iBufferPtr++ = (int16_t)(floatBuffer[0]); + *iBufferPtr++ = (int16_t)(floatBuffer[1]); + *iBufferPtr++ = (int16_t)(floatBuffer[2]); + *iBufferPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + iBufferPtr = &iBuffer[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..2af973bcc --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H +#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + float* iBufferPtr = iBuffer; + + __m128 cplxValue1, cplxValue2, iValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h new file mode 100644 index 000000000..f408589c4 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H +#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_64f_aligned16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + iBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_64f_aligned16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_dot_prod_aligned16.h new file mode 100644 index 000000000..1a834dc25 --- /dev/null +++ b/volk/include/volk/volk_32fc_dot_prod_aligned16.h @@ -0,0 +1,468 @@ +#ifndef INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H +#define INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H + +#include +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; + + } + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_dot_prod_aligned16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + asm + ( + "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %%rcx, %%rax\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 + +static inline void volk_32fc_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " movl 12(%%ebp), %%eax # input\n\t" + " movl 16(%%ebp), %%edx # taps\n\t" + " movl 20(%%ebp), %%ecx # n_bytes\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%eax), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%edx), %%xmm2\n\t" + " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%eax), %%xmmA\n\t" + "# movaps (%%edx), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%eax), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%edx), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%eax), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %%eax\n\t" + " movaps 32(%%edx), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " addl $32, %%edx\n\t" + ".%=L1_test:\n\t" + " decl %%ecx\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" + " shrl $4, %%ecx\n\t" + " andl $1, %%ecx\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " movl 8(%%ebp), %%eax \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%%eax)\n\t" + " movss (%%eax), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %%eax # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : + : "eax", "ecx", "edx" + ); + + + int getem = num_bytes % 16; + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32fc_dot_prod_aligned16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_bytes >> 4; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + lv_32fc_t dotProductVector[2] __attribute__((aligned(16))); + + _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if((num_bytes >> 2) != 0) { + dotProduct += (*a) * (*b); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32fc_dot_prod_aligned16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + volk_32fc_dot_prod_aligned16_sse3(result, input, taps, num_bytes); + // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now + /* + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; + float *p_input, *p_taps; + __m64 *p_result; + + p_result = (__m64*)result; + p_input = (float*)input; + p_taps = (float*)taps; + + static const __m128i neg = {0x000000000000000080000000}; + + int i = 0; + + int bound = (num_bytes >> 5); + int leftovers = (num_bytes & 24) >> 3; + + real0 = _mm_sub_ps(real0, real0); + real1 = _mm_sub_ps(real1, real1); + im0 = _mm_sub_ps(im0, im0); + im1 = _mm_sub_ps(im1, im1); + + for(; i < bound; ++i) { + + + xmm0 = _mm_load_ps(p_input); + xmm1 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm2 = _mm_load_ps(p_input); + xmm3 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); + xmm5 = _mm_unpackhi_ps(xmm1, xmm3); + xmm0 = _mm_unpacklo_ps(xmm0, xmm2); + xmm2 = _mm_unpacklo_ps(xmm1, xmm3); + + //imaginary vector from input + xmm1 = _mm_unpackhi_ps(xmm0, xmm4); + //real vector from input + xmm3 = _mm_unpacklo_ps(xmm0, xmm4); + //imaginary vector from taps + xmm0 = _mm_unpackhi_ps(xmm2, xmm5); + //real vector from taps + xmm2 = _mm_unpacklo_ps(xmm2, xmm5); + + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); + xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); + + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); + xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); + + real0 = _mm_add_ps(xmm4, real0); + real1 = _mm_add_ps(xmm5, real1); + im0 = _mm_add_ps(xmm6, im0); + im1 = _mm_add_ps(xmm7, im1); + + } + + + + + real1 = _mm_xor_ps(real1, (__m128)neg); + + + im0 = _mm_add_ps(im0, im1); + real0 = _mm_add_ps(real0, real1); + + im0 = _mm_add_ps(im0, real0); + + _mm_storel_pi(p_result, im0); + + for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { + + *result += input[i] * taps[i]; + } + */ +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_index_max_aligned16.h b/volk/include/volk/volk_32fc_index_max_aligned16.h new file mode 100644 index 000000000..d77a95f90 --- /dev/null +++ b/volk/include/volk/volk_32fc_index_max_aligned16.h @@ -0,0 +1,215 @@ +#ifndef INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H +#define INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + + +static inline void volk_32fc_index_max_aligned16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { + + + + union bit128 holderf; + union bit128 holderi; + float sq_dist = 0.0; + + + + + union bit128 xmm5, xmm4; + __m128 xmm1, xmm2, xmm3; + __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; + + xmm5.int_vec = xmmfive = _mm_setzero_si128(); + xmm4.int_vec = xmmfour = _mm_setzero_si128(); + holderf.int_vec = holder0 = _mm_setzero_si128(); + holderi.int_vec = holder1 = _mm_setzero_si128(); + + + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! + xmm9 = xmm8 = _mm_setzero_si128(); + xmm10 = _mm_set_epi32(4, 4, 4, 4); + xmm3 = _mm_setzero_ps(); +; + + //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); + + for(; i < bound; ++i) { + + xmm1 = _mm_load_ps((float*)src0); + xmm2 = _mm_load_ps((float*)&src0[2]); + + + src0 += 4; + + + xmm1 = _mm_mul_ps(xmm1, xmm1); + xmm2 = _mm_mul_ps(xmm2, xmm2); + + + xmm1 = _mm_hadd_ps(xmm1, xmm2); + + xmm3 = _mm_max_ps(xmm1, xmm3); + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + + + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); + + } + + + for(i = 0; i < leftovers0; ++i) { + + + xmm2 = _mm_load_ps((float*)src0); + + xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8); + xmm8 = (__m128i)xmm1; + + xmm2 = _mm_mul_ps(xmm2, xmm2); + + src0 += 2; + + xmm1 = _mm_hadd_ps(xmm2, xmm2); + + xmm3 = _mm_max_ps(xmm1, xmm3); + + xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); + + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + } + + + + + for(i = 0; i < leftovers1; ++i) { + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + + sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); + + xmm2 = _mm_load1_ps(&sq_dist); + + xmm1 = xmm3; + + xmm3 = _mm_max_ss(xmm3, xmm2); + + + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + xmm8 = _mm_shuffle_epi32(xmm8, 0x00); + + xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); + + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + } + + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); + + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + _mm_store_ps((float*)&(holderf.f), xmm3); + _mm_store_si128(&(holderi.int_vec), xmm9); + + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; + + + + /* + float placeholder = 0.0; + uint32_t temp0, temp1; + unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); + unsigned int l0 = g0 ^ 1; + + unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); + unsigned int l1 = g1 ^ 1; + + temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; + temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; + sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; + placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; + + g0 = (sq_dist > placeholder); + l0 = g0 ^ 1; + target[0] = g0 * temp0 + l0 * temp1; + */ + +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_index_max_aligned16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { + float sq_dist = 0.0; + float max = 0.0; + unsigned int index = 0; + + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + + sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); + + index = sq_dist > max ? i : index; + max = sq_dist > max ? sq_dist : max; + + + } + target[0] = index; + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h new file mode 100644 index 000000000..4e64d8c22 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h @@ -0,0 +1,146 @@ +#ifndef INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H +#define INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, result; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h new file mode 100644 index 000000000..7a8fd1ef9 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h @@ -0,0 +1,121 @@ +#ifndef INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H +#define INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_multiply_aligned16.h b/volk/include/volk/volk_32fc_multiply_aligned16.h new file mode 100644 index 000000000..6a1649fdb --- /dev/null +++ b/volk/include/volk/volk_32fc_multiply_aligned16.h @@ -0,0 +1,78 @@ +#ifndef INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H +#define INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_multiply_aligned16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h b/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h new file mode 100644 index 000000000..52ec0f95b --- /dev/null +++ b/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h @@ -0,0 +1,134 @@ +#ifndef INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H +#define INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_power_spectral_density_32f_aligned16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invRBW = _mm_set_ps1(iRBW); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Divide by the rbw + power = _mm_mul_ps(power, invRBW); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_power_spectral_density_32f_aligned16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + unsigned int point; + const float invRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h b/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h new file mode 100644 index 000000000..645629b9d --- /dev/null +++ b/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H +#define INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value for each input point + \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param num_points The number of fft data points +*/ +static inline void volk_32fc_power_spectrum_32f_aligned16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iNormalizationFactor = 1.0 / normalizationFactor; +#if LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value for each input point + \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided agains all the input values before the power is calculated + \param num_points The number of fft data points +*/ +static inline void volk_32fc_power_spectrum_32f_aligned16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + const float iNormalizationFactor = 1.0 / normalizationFactor; + unsigned int point; + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_square_dist_aligned16.h b/volk/include/volk/volk_32fc_square_dist_aligned16.h new file mode 100644 index 000000000..6458ea4dd --- /dev/null +++ b/volk/include/volk/volk_32fc_square_dist_aligned16.h @@ -0,0 +1,112 @@ +#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H +#define INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_square_dist_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + lv_32fc_t diff; + float sq_dist; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + _mm_store_ps(target, xmm4); + + target += 4; + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_square_dist_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h b/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h new file mode 100644 index 000000000..0fcc86f1e --- /dev/null +++ b/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H +#define INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_square_dist_scalar_mult_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + lv_32fc_t diff; + memset(&diff, 0x0, 2*sizeof(float)); + + float sq_dist = 0.0; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm8 = _mm_load1_ps(&scalar); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_square_dist_scalar_mult_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32s_and_aligned16.h b/volk/include/volk/volk_32s_and_aligned16.h new file mode 100644 index 000000000..e9f1e3a43 --- /dev/null +++ b/volk/include/volk/volk_32s_and_aligned16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_VOLK_32s_AND_ALIGNED16_H +#define INCLUDED_VOLK_32s_AND_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32s_and_aligned16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_and_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] & bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32s_and_aligned16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) & (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32s_AND_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_convert_32f_aligned16.h b/volk/include/volk/volk_32s_convert_32f_aligned16.h new file mode 100644 index 000000000..a407e68bd --- /dev/null +++ b/volk/include/volk/volk_32s_convert_32f_aligned16.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H +#define INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32s_convert_32f_aligned16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_load_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_store_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32s_convert_32f_aligned16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_convert_32f_unaligned16.h b/volk/include/volk/volk_32s_convert_32f_unaligned16.h new file mode 100644 index 000000000..ad7d4eb17 --- /dev/null +++ b/volk/include/volk/volk_32s_convert_32f_unaligned16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H +#define INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32s_convert_32f_unaligned16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32s_convert_32f_unaligned16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_or_aligned16.h b/volk/include/volk/volk_32s_or_aligned16.h new file mode 100644 index 000000000..f4c427c4d --- /dev/null +++ b/volk/include/volk/volk_32s_or_aligned16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_VOLK_32s_OR_ALIGNED16_H +#define INCLUDED_VOLK_32s_OR_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32s_or_aligned16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_or_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] | bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32s_or_aligned16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) | (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32s_OR_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32u_byteswap_aligned16.h b/volk/include/volk/volk_32u_byteswap_aligned16.h new file mode 100644 index 000000000..09173a9d5 --- /dev/null +++ b/volk/include/volk/volk_32u_byteswap_aligned16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H +#define INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_aligned16_sse2(uint32_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + + uint32_t* inputPtr = intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = quarterPoints*4; + for(; number < num_points; number++){ + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_aligned16_generic(uint32_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = intsToSwap; + + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32u_popcnt_aligned16.h b/volk/include/volk/volk_32u_popcnt_aligned16.h new file mode 100644 index 000000000..37cfd112c --- /dev/null +++ b/volk/include/volk/volk_32u_popcnt_aligned16.h @@ -0,0 +1,36 @@ +#ifndef INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H +#define INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H + +#include +#include + + +#if LV_HAVE_GENERIC + +static inline void volk_32u_popcnt_aligned16_generic(uint32_t* ret, const uint32_t value) { + + // This is faster than a lookup table + uint32_t retVal = value; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + + *ret = retVal; +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE4_2 + +#include + +static inline void volk_32u_popcnt_aligned16_sse4_2(uint32_t* ret, const uint32_t value) { + *ret = _mm_popcnt_u32(value); +} + +#endif /*LV_HAVE_SSE4_2*/ + +#endif /*INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_64f_convert_32f_aligned16.h b/volk/include/volk/volk_64f_convert_32f_aligned16.h new file mode 100644 index 000000000..44df66104 --- /dev/null +++ b/volk/include/volk/volk_64f_convert_32f_aligned16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H +#define INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_aligned16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_aligned16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_unaligned16.h b/volk/include/volk/volk_64f_convert_32f_unaligned16.h new file mode 100644 index 000000000..08cfb6127 --- /dev/null +++ b/volk/include/volk/volk_64f_convert_32f_unaligned16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H +#define INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_unaligned16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_unaligned16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_64f_max_aligned16.h b/volk/include/volk/volk_64f_max_aligned16.h new file mode 100644 index 000000000..ce4907a8c --- /dev/null +++ b/volk/include/volk/volk_64f_max_aligned16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_VOLK_64f_MAX_ALIGNED16_H +#define INCLUDED_VOLK_64f_MAX_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_max_aligned16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_max_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_max_aligned16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_VOLK_64f_MAX_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64f_min_aligned16.h b/volk/include/volk/volk_64f_min_aligned16.h new file mode 100644 index 000000000..acf4d6b2a --- /dev/null +++ b/volk/include/volk/volk_64f_min_aligned16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_VOLK_64f_MIN_ALIGNED16_H +#define INCLUDED_VOLK_64f_MIN_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_min_aligned16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_min_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_min_aligned16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_VOLK_64f_MIN_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64u_byteswap_aligned16.h b/volk/include/volk/volk_64u_byteswap_aligned16.h new file mode 100644 index 000000000..d5e1b6f30 --- /dev/null +++ b/volk/include/volk/volk_64u_byteswap_aligned16.h @@ -0,0 +1,88 @@ +#ifndef INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H +#define INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_aligned16_sse2(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + uint64_t number = 0; + const unsigned int halfPoints = num_points / 2; + for(;number < halfPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + + // Reorder the two words + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = halfPoints*2; + for(; number < num_points; number++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_aligned16_generic(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64u_popcnt_aligned16.h b/volk/include/volk/volk_64u_popcnt_aligned16.h new file mode 100644 index 000000000..e8e1396e7 --- /dev/null +++ b/volk/include/volk/volk_64u_popcnt_aligned16.h @@ -0,0 +1,74 @@ +#ifndef INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H +#define INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H + +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_64u_popcnt_aligned16_generic(uint64_t* ret, const uint64_t value) { + + const uint32_t* valueVector = (const uint32_t*)&value; + + // This is faster than a lookup table + uint32_t retVal = valueVector[0]; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + uint64_t retVal64 = retVal; + + retVal = valueVector[1]; + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + retVal64 += retVal; + + *ret = retVal64; + +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE4_2 + +#include + +static inline void volk_64u_popcnt_aligned16_sse4_2(uint64_t* ret, const uint64_t value) { +#if LV_64 + *ret = _mm_popcnt_u64(value); +#else + const uint32_t* valueVector = (const uint32_t*)&value; + + // This is faster than a lookup table + uint32_t retVal = valueVector[0]; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + uint64_t retVal64 = retVal; + + retVal = valueVector[1]; + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + retVal64 += retVal; + + *ret = retVal64; + +#endif +} + +#endif /*LV_HAVE_SSE4_2*/ + +#endif /*INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_8s_convert_16s_aligned16.h b/volk/include/volk/volk_8s_convert_16s_aligned16.h new file mode 100644 index 000000000..0efe3c6a1 --- /dev/null +++ b/volk/include/volk/volk_8s_convert_16s_aligned16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED16_H +#define INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_convert_16s_aligned16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_convert_16s_aligned16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_unaligned16.h b/volk/include/volk/volk_8s_convert_16s_unaligned16.h new file mode 100644 index 000000000..05b916cea --- /dev/null +++ b/volk/include/volk/volk_8s_convert_16s_unaligned16.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED16_H +#define INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8s_convert_16s_unaligned16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8s_convert_16s_unaligned16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_32f_aligned16.h b/volk/include/volk/volk_8s_convert_32f_aligned16.h new file mode 100644 index 000000000..54b66ef8f --- /dev/null +++ b/volk/include/volk/volk_8s_convert_32f_aligned16.h @@ -0,0 +1,92 @@ +#ifndef INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED16_H +#define INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_convert_32f_aligned16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_convert_32f_aligned16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_32f_unaligned16.h b/volk/include/volk/volk_8s_convert_32f_unaligned16.h new file mode 100644 index 000000000..8019aac9a --- /dev/null +++ b/volk/include/volk/volk_8s_convert_32f_unaligned16.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED16_H +#define INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8s_convert_32f_unaligned16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1( iScalar ); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8s_convert_32f_unaligned16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h new file mode 100644 index 000000000..38eaa49ea --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_16s_aligned16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + __m128i complexVal, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iOutputVal = _mm_cvtepi8_epi16(iOutputVal); + iOutputVal = _mm_slli_epi16(iOutputVal, 8); + + qOutputVal = _mm_cvtepi8_epi16(qOutputVal); + qOutputVal = _mm_slli_epi16(qOutputVal, 8); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..d0c118965 --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h @@ -0,0 +1,164 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_32f_aligned16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue, qFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + iComplexVal = _mm_srli_si128(iComplexVal, 4); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 4; + + qComplexVal = _mm_srli_si128(qComplexVal, 4); + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + + qBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + float floatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int8_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..d0cb49494 --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_16s_aligned16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i complexVal, outputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + outputVal = _mm_cvtepi8_epi16(complexVal); + outputVal = _mm_slli_epi16(outputVal, 7); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..c849448ea --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,133 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + + complexVal = _mm_srli_si128(complexVal, 4); + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h new file mode 100644 index 000000000..d84d64568 --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, outputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); + + outputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h b/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h new file mode 100644 index 000000000..470a67539 --- /dev/null +++ b/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h @@ -0,0 +1,102 @@ +#ifndef INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H +#define INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_multiply_conjugate_16sc_aligned16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + lv_16sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + int16_t* c16Ptr = (int16_t*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_multiply_conjugate_16sc_aligned16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + int16_t* c16Ptr = (int16_t*)cVector; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number =0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h b/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h new file mode 100644 index 000000000..52b444cf7 --- /dev/null +++ b/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H +#define INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_multiply_conjugate_32fc_aligned16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + __m128 ret; + lv_32fc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + a += 4; + b += 4; + } + + number = quarterPoints * 4; + float* cFloatPtr = (float*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cFloatPtr++ = lv_creal(temp) / scalar; + *cFloatPtr++ = lv_cimag(temp) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_multiply_conjugate_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* cPtr = (float*)cVector; + const float invScalar = 1.0 / scalar; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number = 0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cPtr++ = (lv_creal(temp) * invScalar); + *cPtr++ = (lv_cimag(temp) * invScalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H */ diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h new file mode 100644 index 000000000..6f444ad89 --- /dev/null +++ b/volk/include/volk/volk_common.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_LIBVECTOR_COMMON_H +#define INCLUDED_LIBVECTOR_COMMON_H + +#include +#if LV_HAVE_MMX +#include +union bit128{ + uint16_t i16[8]; + uint32_t i[4]; + float f[4]; + double d[2]; + __m128i int_vec; + __m128 float_vec; + __m128d double_vec; +}; +#endif /*LV_HAVE_MMX*/ + +#endif /*INCLUDED_LIBVECTOR_COMMON_H*/ diff --git a/volk/include/volk/volk_complex.h b/volk/include/volk/volk_complex.h new file mode 100644 index 000000000..b20b5cf47 --- /dev/null +++ b/volk/include/volk/volk_complex.h @@ -0,0 +1,71 @@ +#ifndef INCLUDE_VOLK_COMPLEX_H +#define INCLUDE_VOLK_COMPLEX_H + +/*! + \brief This header file is to prevent issues with having and variables in the same code as the gcc compiler does not allow that +*/ +#ifdef __cplusplus + +#include +#include + +typedef std::complex lv_8sc_t; +typedef std::complex lv_16sc_t; +typedef std::complex lv_32sc_t; +typedef std::complex lv_32fc_t; +typedef std::complex lv_64fc_t; + +static inline float lv_creal(const lv_32fc_t x){ + return x.real(); +} + +static inline float lv_cimag(const lv_32fc_t x){ + return x.imag(); +} + +static inline lv_32fc_t lv_conj(const lv_32fc_t x){ + return std::conj(x); +} + +static inline lv_32fc_t lv_cpow(const lv_32fc_t x, const lv_32fc_t y){ + return std::pow(x, y); +} + +static inline lv_32fc_t lv_32fc_init(const float x, const float y){ + return std::complex(x,y); +} + +#else + +#include + +typedef char complex lv_8sc_t; +typedef short complex lv_16sc_t; +typedef int complex lv_32sc_t; +typedef float complex lv_32fc_t; +typedef double complex lv_64fc_t; + +static inline float lv_creal(const lv_32fc_t x){ + return creal(x); +} + +static inline float lv_cimag(const lv_32fc_t x){ + return cimag(x); +} + +static inline lv_32fc_t lv_conj(const lv_32fc_t x){ + return conj(x); +} + +static inline lv_32fc_t lv_cpow(const lv_32fc_t x, const lv_32fc_t y){ + return cpow(x, y); +} + +static inline lv_32fc_t lv_32fc_init(const float x, const float y){ + return x + I*y; +} + +#endif + + +#endif /* INCLUDE_VOLK_COMPLEX_H */ diff --git a/volk/include/volk/volk_regexp.py b/volk/include/volk/volk_regexp.py new file mode 100644 index 000000000..7b695cb3b --- /dev/null +++ b/volk/include/volk/volk_regexp.py @@ -0,0 +1,8 @@ +import re + +remove_after_underscore = re.compile("_.*"); +space_remove = re.compile(" "); +leading_space_remove = re.compile("^ *"); +replace_arch = re.compile(", const char\* arch"); +replace_bracket = re.compile(" {"); +replace_volk = re.compile("volk"); diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py new file mode 100755 index 000000000..d00501a93 --- /dev/null +++ b/volk/include/volk/volk_register.py @@ -0,0 +1,279 @@ +#! /usr/bin/python + +import sys +import re +import string +from xml.dom import minidom +from volk_regexp import * +from make_cpuid_x86_c import make_cpuid_x86_c +from make_cpuid_h import make_cpuid_h +from make_proccpu_sim import make_proccpu_sim +from make_set_simd import make_set_simd +from make_cpuid_generic_c import make_cpuid_generic_c +from make_cpuid_powerpc_c import make_cpuid_powerpc_c +from make_registry import make_registry +from make_h import make_h +from make_init_h import make_init_h +from make_config_fixed import make_config_fixed +from make_config_in import make_config_in +from make_c import make_c +from make_runtime_c import make_runtime_c +from make_init_c import make_init_c +from make_runtime import make_runtime +from make_typedefs import make_typedefs +from make_mktables import make_mktables +from make_environment_init_c import make_environment_init_c +from make_environment_init_h import make_environment_init_h + +outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); +outfile_reg = open("volk_registry.h", "w"); +outfile_h = open("volk.h", "w"); +outfile_c = open("../../lib/volk.c", "w"); +outfile_runtime = open("volk_runtime.h", "w"); +outfile_runtime_c = open("../../lib/volk_runtime.c", "w"); +outfile_typedefs = open("volk_typedefs.h", "w"); +outfile_init_h = open("../../lib/volk_init.h", "w"); +outfile_init_c = open("../../lib/volk_init.c", "w"); +outfile_cpu_h = open("volk_cpu.h", "w"); +outfile_cpu_x86_c = open("../../lib/volk_cpu_x86.c", "w"); +outfile_cpu_generic_c = open("../../lib/volk_cpu_generic.c", "w"); +outfile_cpu_powerpc_c = open("../../lib/volk_cpu_powerpc.c", "w"); +outfile_proccpu_sim = open("../../lib/volk_proccpu_sim.c", "w"); +outfile_config_in = open("../../volk_config.h.in", "w"); +outfile_config_fixed = open("volk_config_fixed.h", "w"); +outfile_mktables = open("../../lib/volk_mktables.c", "w"); +outfile_environment_c = open("../../lib/volk_environment_init.c", "w"); +outfile_environment_h = open("volk_environment_init.h", "w"); +infile = open("Makefile.am", "r"); + + +mfile = infile.readlines(); + +datatypes = []; +functions = []; + + + +for line in mfile: + subline = re.search(".*(aligned).*", line); + if subline: + subsubline = re.search("(?<=volk_).*", subline.group(0)); + if subsubline: + dtype = remove_after_underscore.sub("", subsubline.group(0)); + subdtype = re.search("[0-9]+[A-z]+", dtype); + if subdtype: + datatypes.append(subdtype.group(0)); + + +datatypes = set(datatypes); + +for line in mfile: + for dt in datatypes: + if dt in line: + subline = re.search("(volk_" + dt +"_.*(aligned).*\.h)", line); + if subline: + + subsubline = re.search(".+(?=\.h)", subline.group(0)); + + functions.append(subsubline.group(0)); + +archs = []; +afile = minidom.parse("archs.xml"); +filearchs = afile.getElementsByTagName("arch"); +for filearch in filearchs: + archs.append(str(filearch.attributes["name"].value)); + +for arch in archs: + a_var = re.search("^\$", arch); + if a_var: + archs.remove(arch); + + + +archs_or = "(" +for arch in archs: + archs_or = archs_or + string.upper(arch) + "|"; +archs_or = archs_or[0:len(archs_or)-1]; +archs_or = archs_or + ")"; + + +taglist = []; +fcountlist = []; +arched_arglist = []; +retlist = []; +my_arglist = []; +my_argtypelist = []; +for func in functions: + tags = []; + fcount = []; + infile_source = open(func + ".h"); + begun_name = 0; + begun_paren = 0; + sourcefile = infile_source.readlines(); + infile_source.close(); + for line in sourcefile: + + archline = re.search("^\#if.*?LV_HAVE_" + archs_or + ".*", line); + if archline: + arch = archline.group(0); + archline = re.findall(archs_or + "(?=( |\n|&))", line); + if archline: + archsublist = []; + for tup in archline: + archsublist.append(tup[0]); + fcount.append(archsublist); + testline = re.search("static inline.*?" + func, line); + if (not testline): + continue + tagline = re.search(func + "_.+", line); + if tagline: + tag = re.search("(?<=" + func + "_)\w+(?= *\()",line); + if tag: + tag = re.search("\w+", tag.group(0)); + if tag: + tags.append(tag.group(0)); + + + if begun_name == 0: + retline = re.search(".+(?=" + func + ")", line); + if retline: + ret = retline.group(0); + + + + + subline = re.search(func + ".*", line); + if subline: + subsubline = re.search("\(.*?\)", subline.group(0)); + if subsubline: + args = subsubline.group(0); + + else: + begun_name = 1; + subsubline = re.search("\(.*", subline.group(0)); + if subsubline: + args = subsubline.group(0); + begun_paren = 1; + else: + if begun_paren == 1: + subline = re.search(".*?\)", line); + if subline: + args = args + subline.group(0); + begun_name = 0; + begun_paren = 0; + else: + subline = re.search(".*", line); + args = args + subline.group(0); + else: + subline = re.search("\(.*?\)", line); + if subline: + args = subline.group(0); + begun_name = 0; + else: + subline = re.search("\(.*", line); + if subline: + args = subline.group(0); + begun_paren = 1; + + replace = re.compile("static "); + ret = replace.sub("", ret); + replace = re.compile("inline "); + ret = replace.sub("", ret); + replace = re.compile("\)"); + arched_args = replace.sub(", const char* arch) {", args); + + remove = re.compile('\)|\(|{'); + rargs = remove.sub("", args); + sargs = rargs.split(','); + + + + margs = []; + atypes = []; + for arg in sargs: + temp = arg.split(" "); + margs.append(temp[-1]); + replace = re.compile(" " + temp[-1]); + atypes.append(replace.sub("", arg)); + + + my_args = "" + arg_types = "" + for arg in range(0, len(margs) - 1): + this_arg = leading_space_remove.sub("", margs[arg]); + my_args = my_args + this_arg + ", "; + this_type = leading_space_remove.sub("", atypes[arg]); + arg_types = arg_types + this_type + ", "; + + this_arg = leading_space_remove.sub("", margs[-1]); + my_args = my_args + this_arg; + this_type = leading_space_remove.sub("", atypes[-1]); + arg_types = arg_types + this_type; + my_argtypelist.append(arg_types); + + if(ret[-1] != ' '): + ret = ret + ' '; + + arched_arglist.append(arched_args); #!!!!!!!!!!! + my_arglist.append(my_args) #!!!!!!!!!!!!!!!!! + retlist.append(ret); + fcountlist.append(fcount); + taglist.append(tags); + +outfile_mktables.write(make_mktables(functions)); +outfile_mktables.close(); + + +outfile_cpu_h.write(make_cpuid_h(filearchs)); +outfile_cpu_h.close(); + +outfile_cpu_x86_c.write(make_cpuid_x86_c(filearchs)); +outfile_cpu_x86_c.close(); + +outfile_proccpu_sim.write(make_proccpu_sim(filearchs)); +outfile_proccpu_sim.close(); + +outfile_set_simd.write(make_set_simd(filearchs)); +outfile_set_simd.close(); + +outfile_cpu_generic_c.write(make_cpuid_generic_c(filearchs)); +outfile_cpu_generic_c.close(); + +outfile_cpu_powerpc_c.write(make_cpuid_powerpc_c(filearchs)); +outfile_cpu_powerpc_c.close(); + +outfile_config_in.write(make_config_in(filearchs)); +outfile_config_in.close(); + +outfile_reg.write(make_registry(filearchs, functions, fcountlist)); +outfile_reg.close(); + +outfile_h.write(make_h(functions, arched_arglist, retlist)); +outfile_h.close(); + +outfile_init_h.write(make_init_h(functions, arched_arglist, retlist)); +outfile_init_h.close(); + +outfile_config_fixed.write(make_config_fixed(filearchs)); +outfile_config_fixed.close(); + +outfile_c.write( make_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist)); +outfile_c.close(); + +outfile_runtime_c.write(make_runtime_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist)); +outfile_runtime_c.close(); + +outfile_init_c.write(make_init_c(functions, filearchs)); +outfile_init_c.close(); + +outfile_runtime.write(make_runtime(functions)); +outfile_runtime.close(); + +outfile_typedefs.write(make_typedefs(functions, retlist, my_argtypelist)); +outfile_typedefs.close(); + +outfile_environment_c.write(make_environment_init_c(filearchs)); +outfile_environment_c.close(); + +outfile_environment_h.write(make_environment_init_h()); +outfile_environment_h.close(); -- cgit From 1186ab980d1e1fe36ff869097fd0697dd41ff7ae Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 00:59:23 -0500 Subject: volk: Updating build system so that volk configures and builds from gnuradio top level dir. --- volk/include/volk/Makefile.am | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index fd59ab795..94c78ffff 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright 2008 Free Software Foundation, Inc. +# Copyright 2010 Free Software Foundation, Inc. # # This file is part of GNU Radio # @@ -20,7 +20,9 @@ include $(top_srcdir)/Makefile.common -ourinclude_HEADERS = \ +volkincludedir = $(prefix)/include/volk + +volkinclude_HEADERS = \ volk_complex.h \ volk_common.h \ volk_config_fixed.h \ -- cgit From f8b0c86d8a9eb347cb7187e3b01ed46c66de6a64 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 01:09:35 -0500 Subject: volk: Adding gitignore files. --- volk/include/.gitignore | 10 ++++++++++ volk/include/volk/.gitignore | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 volk/include/.gitignore create mode 100644 volk/include/volk/.gitignore (limited to 'volk/include') diff --git a/volk/include/.gitignore b/volk/include/.gitignore new file mode 100644 index 000000000..378f771f5 --- /dev/null +++ b/volk/include/.gitignore @@ -0,0 +1,10 @@ +/*.cache +/*.la +/*.lo +/*.pc +/.deps +/.la +/.libs +/.lo +/Makefile +/Makefile.in diff --git a/volk/include/volk/.gitignore b/volk/include/volk/.gitignore new file mode 100644 index 000000000..ae4824cc3 --- /dev/null +++ b/volk/include/volk/.gitignore @@ -0,0 +1,19 @@ +/*.cache +/*.la +/*.lo +/*.pc +/.deps +/.la +/.libs +/.lo +/Makefile +/Makefile.in +/volk.h +/volk_config.h +/volk_config_fixed.h +/volk_cpu.h +/volk_environment_init.h +/volk_registry.h +/volk_runtime.h +/volk_tables.h +/volk_typedefs.h -- cgit From 74f206edb2c7bfbe010b5a5cbc5fe2f07965c3a6 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 01:29:58 -0500 Subject: volk: Fixing makefiles for dist. Distcheck still failing on other issues now. --- volk/include/volk/Makefile.am | 1 + 1 file changed, 1 insertion(+) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 94c78ffff..9894fdb99 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -30,6 +30,7 @@ volkinclude_HEADERS = \ volk_config.h \ volk_tables.h \ volk_typedefs.h \ + volk_registry.h \ volk.h \ volk_cpu.h \ volk_environment_init.h \ -- cgit From 46d55649012e4fb2838a6f8e9f3c9226ea8b2d50 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 12:19:28 -0500 Subject: volk: Working on VPATH build issues. Makes it through configure, fails on make. --- volk/include/volk/make_set_simd.py | 1 - 1 file changed, 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index e320bc748..c35aa06fa 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -98,7 +98,6 @@ def make_set_simd(dom) : tempstring = tempstring + " cf_with_lv_arch=$lv_PROCCPU\n"; tempstring = tempstring + " fi\n"; - tempstring = tempstring + " echo $cf_with_lv_arch\n"; for domarch in dom: if str(domarch.attributes["type"].value) != "all": arch = str(domarch.attributes["name"].value); -- cgit From 1cc88091470dd4654b6936cda92d81841e135209 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 17:00:38 -0500 Subject: volk: more changes to build system so that VPATH builds properly and project makes distcheck. --- volk/include/volk/make_set_simd.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index c35aa06fa..03c2b2d8f 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -12,7 +12,7 @@ def make_set_simd(dom) : tempstring = tempstring + " (x86)\n"; tempstring = tempstring + " case \"$MD_SUBCPU\" in\n"; tempstring = tempstring + " (x86)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ -I./lib lib/volk_proccpu_sim.c lib/volk_cpu_x86.c lib/cpuid_x86.S 2>&1`\"\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I $srcdir/include/ -I$srcdir/lib $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_x86.c $srcdir/lib/cpuid_x86.S`\"\n"; tempstring = tempstring + " then\n"; tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; @@ -23,7 +23,7 @@ def make_set_simd(dom) : tempstring = tempstring + " fi\n" tempstring = tempstring + " ;;\n" tempstring = tempstring + " (*)\n" - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ -I./lib lib/volk_proccpu_sim.c lib/volk_cpu_x86.c lib/cpuid_x86_64.S 2>&1`\"\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ -I$srcdir/lib $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_x86.c $srcdir/lib/cpuid_x86_64.S`\"\n"; tempstring = tempstring + " then\n"; tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; @@ -36,7 +36,7 @@ def make_set_simd(dom) : tempstring = tempstring + " esac\n" tempstring = tempstring + " ;;\n"; tempstring = tempstring + " (powerpc)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ lib/volk_proccpu_sim.c lib/volk_cpu_powerpc.c 2>&1`\"\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_powerpc.c 2>&1`\"\n"; tempstring = tempstring + " then\n"; tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; @@ -47,7 +47,7 @@ def make_set_simd(dom) : tempstring = tempstring + " fi\n" tempstring = tempstring + " ;;\n"; tempstring = tempstring + " (*)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ lib/volk_proccpu_sim.c lib/volk_cpu_generic.c 2>&1`\"\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_generic.c 2>&1`\"\n"; tempstring = tempstring + " then\n"; tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; -- cgit From 79ca8f230d0f0758656e32a2bb0317f03e3317d0 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 9 Dec 2010 16:52:33 -0500 Subject: volk: remove redundant warning message at top of volk.c file. --- volk/include/volk/make_c.py | 2 -- 1 file changed, 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index 08d6d949c..f2432d7a4 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -5,8 +5,6 @@ from volk_regexp import * def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; tempstring = tempstring + '\n\n#include\n'; tempstring = tempstring + '#include\n'; -- cgit From 97ff51156b492a1e52420a1f12cd75eccf2235ac Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 9 Dec 2010 16:53:06 -0500 Subject: volk: Removing erroneous SSE function that actually usese an SSE3 intrin (mm_hadd_ps). --- .../volk/volk_16sc_magnitude_32f_aligned16.h | 66 ---------------------- 1 file changed, 66 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h index 29c58ceb8..8ba5737e8 100644 --- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h +++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h @@ -70,72 +70,6 @@ static inline void volk_16sc_magnitude_32f_aligned16_sse3(float* magnitudeVector } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - - __m128 cplxValue1, cplxValue2, result; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) * iScalar; - float val1Imag = (float)(*complexVectorPtr++) * iScalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE */ - #if LV_HAVE_GENERIC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector -- cgit From 87889c20c7551b39bb508408035c894dfc32fb0d Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 9 Dec 2010 17:09:23 -0500 Subject: volk: modified the configure scripts to output which architectures it will be building based on the configure tests. --- volk/include/volk/make_set_simd.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 03c2b2d8f..279ce06e2 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -1,3 +1,20 @@ +# +# Copyright 2010 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + from xml.dom import minidom def make_set_simd(dom) : @@ -93,6 +110,7 @@ def make_set_simd(dom) : arch = str(domarch.attributes["name"].value); tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; tempstring = tempstring + " ADDONS=\"\"\n"; + tempstring = tempstring + " BUILT_ARCHS=\"generic\"\n"; tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; tempstring = tempstring + " cf_with_lv_arch=$lv_PROCCPU\n"; @@ -138,6 +156,7 @@ def make_set_simd(dom) : tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\""; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indCC=no\n" @@ -182,6 +201,7 @@ def make_set_simd(dom) : tempstring = tempstring + " if test \"$indCC\" = yes && test \"indCXX\" = yes && \"indLV_ARCH\" = yes; then\n" tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\""; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indCC=no\n" -- cgit From 61d7638304db3c999bd43808fce0e531996dba87 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 9 Dec 2010 20:24:21 -0500 Subject: volk: readding 16sc_magnitude_32f_sse with fix for SSE hadd_ps error. --- .../volk/volk_16sc_magnitude_32f_aligned16.h | 70 ++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h index 8ba5737e8..9c2a48835 100644 --- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h +++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h @@ -70,6 +70,76 @@ static inline void volk_16sc_magnitude_32f_aligned16_sse3(float* magnitudeVector } #endif /* LV_HAVE_SSE3 */ +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + + __m128 cplxValue1, cplxValue2, result, re, im; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); + im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(re, invScalar); + cplxValue2 = _mm_mul_ps(im, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) * iScalar; + float val1Imag = (float)(*complexVectorPtr++) * iScalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} + + +#endif /* LV_HAVE_SSE */ + #if LV_HAVE_GENERIC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector -- cgit From 8375fd6ca2f6e5edb923abe0d6341b6d4d2d1aae Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 10 Dec 2010 01:48:17 -0500 Subject: volk: Fixing build system to handle making volk_mktables, volk_tables.h, and volk_config.h instead of a standalone shell script. --- volk/include/volk/Makefile.am | 48 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 9894fdb99..256cc8420 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -20,8 +20,15 @@ include $(top_srcdir)/Makefile.common +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ + $(LV_CXXFLAGS) $(WITH_INCLUDES) + volkincludedir = $(prefix)/include/volk +BUILT_SOURCES: \ + volk_config.h \ + volk_tables.h + volkinclude_HEADERS = \ volk_complex.h \ volk_common.h \ @@ -122,6 +129,43 @@ volkinclude_HEADERS = \ volk_8s_convert_32f_aligned16.h \ volk_8s_convert_32f_unaligned16.h +VOLK_MKTABLES_SOURCES = \ + $(abs_top_srcdir)/lib/volk_rank_archs.c \ + $(abs_top_srcdir)/lib/volk_mktables.c + + +# FIXME: Not very extensible to supporting more processors easily +if MYSUBCPU_X86_64 + VOLK_MKTABLES_SOURCES += \ + $(abs_top_srcdir)/lib/volk_cpu_x86.c \ + $(abs_top_srcdir)/lib/cpuid_x86_64.S +endif + +if MYSUBCPU_X86 + VOLK_MKTABLES_SOURCES += \ + $(abs_top_srcdir)/lib/volk_cpu_x86.c \ + $(abs_top_srcdir)/lib/cpuid_x86.S +endif + +if MYSUBCPU_POWERPC + VOLK_MKTABLES_SOURCES += \ + $(abs_top_srcdir)/lib/volk_cpu_powerpc.c +endif + +if MYSUBCPU_GENERIC + VOLK_MKTABLES_SOURCES += \ + $(abs_top_srcdir)/lib/volk_cpu_generic.c +endif + +volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) + $(CC) -o $@ $^ $(AM_CPPFLAGS) + +volk_tables.h: volk_mktables$(EXEEXT) + ./volk_mktables$(EXEEXT) + +volk_config.h: $(abs_top_builddir)/volk_config.h + cp $^ $(abs_top_builddir)/include/volk/$@ + distclean-local: rm -f volk_config_fixed.h rm -f volk_config.h @@ -133,4 +177,6 @@ distclean-local: rm -f volk_tables.h rm -f *.pyc rm -f Makefile.in - rm -f volk_environment_init.h \ No newline at end of file + rm -f volk_environment_init.h + rm -f volk_mktables + rm -f $(BUILT_SOURCES) \ No newline at end of file -- cgit From e7aa093f30c48c3c85abf446e24bc1cf144dc3e0 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 10 Dec 2010 02:22:20 -0500 Subject: volk: May be a hack, but it was required for my 32-bit Fedora 13 to work. --- volk/include/volk/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 256cc8420..9cf783c0f 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -158,7 +158,7 @@ if MYSUBCPU_GENERIC endif volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) - $(CC) -o $@ $^ $(AM_CPPFLAGS) + $(CC) -o $@ $^ $(AM_CPPFLAGS) -I$(abs_top_builddir)/include volk_tables.h: volk_mktables$(EXEEXT) ./volk_mktables$(EXEEXT) -- cgit From 88a4f3fcddb5a082caa39fe800ea893aface1280 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 11 Dec 2010 13:35:00 -0500 Subject: volk: changing the path variables again. This works on my various systems tested. Using abs_ path names failed on Ubuntu 8.04 32-bit. --- volk/include/volk/Makefile.am | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 9cf783c0f..04a43bd34 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -130,41 +130,41 @@ volkinclude_HEADERS = \ volk_8s_convert_32f_unaligned16.h VOLK_MKTABLES_SOURCES = \ - $(abs_top_srcdir)/lib/volk_rank_archs.c \ - $(abs_top_srcdir)/lib/volk_mktables.c + $(top_srcdir)/lib/volk_rank_archs.c \ + $(top_srcdir)/lib/volk_mktables.c # FIXME: Not very extensible to supporting more processors easily if MYSUBCPU_X86_64 VOLK_MKTABLES_SOURCES += \ - $(abs_top_srcdir)/lib/volk_cpu_x86.c \ - $(abs_top_srcdir)/lib/cpuid_x86_64.S + $(top_srcdir)/lib/volk_cpu_x86.c \ + $(top_srcdir)/lib/cpuid_x86_64.S endif if MYSUBCPU_X86 VOLK_MKTABLES_SOURCES += \ - $(abs_top_srcdir)/lib/volk_cpu_x86.c \ - $(abs_top_srcdir)/lib/cpuid_x86.S + $(top_srcdir)/lib/volk_cpu_x86.c \ + $(top_srcdir)/lib/cpuid_x86.S endif if MYSUBCPU_POWERPC VOLK_MKTABLES_SOURCES += \ - $(abs_top_srcdir)/lib/volk_cpu_powerpc.c + $(top_srcdir)/lib/volk_cpu_powerpc.c endif if MYSUBCPU_GENERIC VOLK_MKTABLES_SOURCES += \ - $(abs_top_srcdir)/lib/volk_cpu_generic.c + $(top_srcdir)/lib/volk_cpu_generic.c endif volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) - $(CC) -o $@ $^ $(AM_CPPFLAGS) -I$(abs_top_builddir)/include + $(CC) -o $@ $^ $(AM_CPPFLAGS) -I$(top_builddir)/include volk_tables.h: volk_mktables$(EXEEXT) ./volk_mktables$(EXEEXT) -volk_config.h: $(abs_top_builddir)/volk_config.h - cp $^ $(abs_top_builddir)/include/volk/$@ +volk_config.h: $(top_builddir)/volk_config.h + cp $^ $(top_builddir)/include/volk/$@ distclean-local: rm -f volk_config_fixed.h -- cgit From 6cd7639e86285c2fc0f3771fd1111c4456d477a4 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 11 Dec 2010 13:39:00 -0500 Subject: volk: fix for running 32-bit OS on 64-bit processor. System is correctly identified as 32-bit and compiles with the correct flags. --- volk/include/volk/archs.xml | 10 +++++----- volk/include/volk/make_cpuid_x86_c.py | 26 +++++++++++++++++++++----- volk/include/volk/make_set_simd.py | 25 +++++++++++++++++++++---- 3 files changed, 47 insertions(+), 14 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index c9c441008..b7c98500f 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -9,12 +9,10 @@ maltivec - - 0x80000001 + m32 - d - 29 - 0 + MD_SUBCPU + x86_64 @@ -23,6 +21,8 @@ 29 m64 1 + MD_SUBCPU + x86 diff --git a/volk/include/volk/make_cpuid_x86_c.py b/volk/include/volk/make_cpuid_x86_c.py index fd83e5e5c..8ebe243e5 100644 --- a/volk/include/volk/make_cpuid_x86_c.py +++ b/volk/include/volk/make_cpuid_x86_c.py @@ -33,17 +33,33 @@ def make_cpuid_x86_c(dom) : for domarch in dom: if str(domarch.attributes["type"].value) == "x86": + if "no_test" in domarch.attributes.keys(): + no_test = str(domarch.attributes["no_test"].value); + if no_test == "true": + no_test = True; + else: + no_test = False; + else: + no_test = False; arch = str(domarch.attributes["name"].value); op = domarch.getElementsByTagName("op"); - op = str(op[0].firstChild.data); + if op: + op = str(op[0].firstChild.data); reg = domarch.getElementsByTagName("reg"); - reg = str(reg[0].firstChild.data); + if reg: + reg = str(reg[0].firstChild.data); shift = domarch.getElementsByTagName("shift"); - shift = str(shift[0].firstChild.data); + if shift: + shift = str(shift[0].firstChild.data); val = domarch.getElementsByTagName("val"); - val = str(val[0].firstChild.data); + if val: + val = str(val[0].firstChild.data); - if op == "1": + if no_test: + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 1;\n" + tempstring = tempstring + "}\n\n" + elif op == "1": tempstring = tempstring + "int i_can_has_" + arch + " () {\n" tempstring = tempstring + " unsigned int e" + reg + "x = cpuid_e" + reg + "x (" + op + ");\n" tempstring = tempstring + " return ((e" + reg + "x >> " + shift + ") & 1) == " + val + ";\n" diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 279ce06e2..842366b18 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -112,9 +112,11 @@ def make_set_simd(dom) : tempstring = tempstring + " ADDONS=\"\"\n"; tempstring = tempstring + " BUILT_ARCHS=\"generic\"\n"; tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; + tempstring = tempstring + " OVERRULE_FLAG=\"no\"\n"; tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; tempstring = tempstring + " cf_with_lv_arch=$lv_PROCCPU\n"; - + tempstring = tempstring + " OVERRULE_FLAG=\"yes\"\n"; + tempstring = tempstring + " fi\n"; for domarch in dom: if str(domarch.attributes["type"].value) != "all": @@ -132,6 +134,16 @@ def make_set_simd(dom) : for domarch in dom: arch = str(domarch.attributes["name"].value); atype = str(domarch.attributes["type"].value); + overrule = domarch.getElementsByTagName("overrule"); + if overrule: + overrule = str(overrule[0].firstChild.data); + else: + overrule = ""; + overrule_val = domarch.getElementsByTagName("overrule_val"); + if overrule_val: + overrule_val = str(overrule_val[0].firstChild.data); + else: + overrule_val = ""; flag = domarch.getElementsByTagName("flag"); flag = str(flag[0].firstChild.data); if atype == "x86": @@ -153,10 +165,15 @@ def make_set_simd(dom) : tempstring = tempstring + " indLV_ARCH=yes\n" tempstring = tempstring + " fi\n" tempstring = tempstring + " done\n" - tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " fi\n" + + tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\""; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indCC=no\n" @@ -201,7 +218,7 @@ def make_set_simd(dom) : tempstring = tempstring + " if test \"$indCC\" = yes && test \"indCXX\" = yes && \"indLV_ARCH\" = yes; then\n" tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\""; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indCC=no\n" -- cgit From bef3db60e73953f2d2ecdc6a86a81e11df3b103d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Mon, 13 Dec 2010 19:18:45 -0800 Subject: volk: committed some stuff i neglected --- volk/include/volk/Makefile.am | 2 +- volk/include/volk/archs.xml | 4 ++++ volk/include/volk/make_set_simd.py | 3 +++ volk/include/volk/volk_32f_sqrt_aligned16.h | 13 +++++++++++++ 4 files changed, 21 insertions(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 04a43bd34..99276ab87 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -179,4 +179,4 @@ distclean-local: rm -f Makefile.in rm -f volk_environment_init.h rm -f volk_mktables - rm -f $(BUILT_SOURCES) \ No newline at end of file + rm -f $(BUILT_SOURCES) diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index b7c98500f..a828e5ad0 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -5,6 +5,10 @@ none + + lorc-0.4 + + maltivec diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 842366b18..e568aebfa 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -111,6 +111,9 @@ def make_set_simd(dom) : tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; tempstring = tempstring + " ADDONS=\"\"\n"; tempstring = tempstring + " BUILT_ARCHS=\"generic\"\n"; + tempstring = tempstring + " if test $HAVE_ORC = yes; then\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} orc\"\n"; + tempstring = tempstring + " fi\n"; tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; tempstring = tempstring + " OVERRULE_FLAG=\"no\"\n"; tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; diff --git a/volk/include/volk/volk_32f_sqrt_aligned16.h b/volk/include/volk/volk_32f_sqrt_aligned16.h index 0b2eaf251..f6996ad5f 100644 --- a/volk/include/volk/volk_32f_sqrt_aligned16.h +++ b/volk/include/volk/volk_32f_sqrt_aligned16.h @@ -58,6 +58,19 @@ static inline void volk_32f_sqrt_aligned16_generic(float* cVector, const float* } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +extern void volk_32f_sqrt_aligned16_orc_impl(float *, const float*, unsigned int); +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_aligned16_orc(float* cVector, const float* aVector, unsigned int num_points){ + volk_32f_sqrt_aligned16_orc_impl(cVector, aVector, num_points); +} + +#endif /* LV_HAVE_ORC */ -- cgit From 05f4bced29987a0a573d1fc5b214f3fa01dc84bd Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 13:36:55 -0800 Subject: Volk: More autotools stuff for Orc. Should build OK with or without Orc now. --- volk/include/volk/make_set_simd.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index e568aebfa..78a00476d 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -183,8 +183,14 @@ def make_set_simd(dom) : tempstring = tempstring + " indCXX=no\n" tempstring = tempstring + " indLV_ARCH=no\n" elif atype == "all": - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + if arch == "orc": + tempstring = tempstring + " if test $HAVE_ORC = yes; then\n"; + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " fi\n"; + else: + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " ;;\n" tempstring = tempstring + " (powerpc)\n" -- cgit From be78b530701850b964118fd0f63ba2bbdca9759d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 14:14:00 -0800 Subject: pre-patch... --- volk/include/volk/volk_32s_and_aligned16.h | 14 +++++++++++++- volk/include/volk/volk_8s_convert_16s_aligned16.h | 12 ++++++++++++ volk/include/volk/volk_8s_convert_32f_aligned16.h | 13 +++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32s_and_aligned16.h b/volk/include/volk/volk_32s_and_aligned16.h index e9f1e3a43..16c63fd48 100644 --- a/volk/include/volk/volk_32s_and_aligned16.h +++ b/volk/include/volk/volk_32s_and_aligned16.h @@ -63,7 +63,19 @@ static inline void volk_32s_and_aligned16_generic(int32_t* cVector, const int32_ } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +extern void volk_32s_and_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32s_and_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32s_and_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_32s_AND_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_aligned16.h b/volk/include/volk/volk_8s_convert_16s_aligned16.h index 0efe3c6a1..c52c64eae 100644 --- a/volk/include/volk/volk_8s_convert_16s_aligned16.h +++ b/volk/include/volk/volk_8s_convert_16s_aligned16.h @@ -65,6 +65,18 @@ static inline void volk_8s_convert_16s_aligned16_generic(int16_t* outputVector, } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +extern void volk_8s_convert_16s_aligned16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); +static inline void volk_8s_convert_16s_aligned16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + volk_8s_convert_16s_aligned16_orc_impl(outputVector, inputVector, num_points); +} +#endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_8s_convert_32f_aligned16.h b/volk/include/volk/volk_8s_convert_32f_aligned16.h index 54b66ef8f..700a0fa42 100644 --- a/volk/include/volk/volk_8s_convert_32f_aligned16.h +++ b/volk/include/volk/volk_8s_convert_32f_aligned16.h @@ -86,6 +86,19 @@ static inline void volk_8s_convert_32f_aligned16_generic(float* outputVector, co } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +extern void volk_8s_convert_32f_aligned16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); +static inline void volk_8s_convert_32f_aligned16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + volk_8s_convert_32f_aligned16_orc_impl(outputVector, inputVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ -- cgit From 2e9a7d350713b4e1b21458db8f3fce8a557858ae Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 17:13:40 -0800 Subject: Volk: Added QA tests for all the Orc stuff. Added a 16u_byteswap but it's broken right now. --- volk/include/volk/volk_16u_byteswap_aligned16.h | 12 ++++++++++++ volk/include/volk/volk_32f_add_aligned16.h | 14 +++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h index 698e958e4..8e628ab17 100644 --- a/volk/include/volk/volk_16u_byteswap_aligned16.h +++ b/volk/include/volk/volk_16u_byteswap_aligned16.h @@ -61,5 +61,17 @@ static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, uns } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, uint16_t* intsToSwapAgain, unsigned int num_points); +static inline void volk_16u_byteswap_aligned16_orc(uint16_t* intsToSwap, unsigned int num_points){ + volk_16u_byteswap_aligned16_orc_impl(intsToSwap, intsToSwap, num_points); +} +#endif /* LV_HAVE_ORC */ + #endif /* INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_add_aligned16.h b/volk/include/volk/volk_32f_add_aligned16.h index 721c60fd6..e7d8de265 100644 --- a/volk/include/volk/volk_32f_add_aligned16.h +++ b/volk/include/volk/volk_32f_add_aligned16.h @@ -63,7 +63,19 @@ static inline void volk_32f_add_aligned16_generic(float* cVector, const float* a } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +extern void volk_32f_add_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_add_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_add_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_32f_ADD_ALIGNED16_H */ -- cgit From 26415a1445490cc3230c5d793a41703931ae9d01 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 17:23:20 -0800 Subject: Volk: Nick's commits to make adding Orc a little more structurally sound --- volk/include/volk/archs.xml | 2 ++ volk/include/volk/make_set_simd.py | 22 ++++++++++------------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index a828e5ad0..b61bfca09 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -7,6 +7,8 @@ lorc-0.4 + HAVE_ORC + no diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 78a00476d..d7109cfcb 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -110,10 +110,7 @@ def make_set_simd(dom) : arch = str(domarch.attributes["name"].value); tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; tempstring = tempstring + " ADDONS=\"\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"generic\"\n"; - tempstring = tempstring + " if test $HAVE_ORC = yes; then\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} orc\"\n"; - tempstring = tempstring + " fi\n"; + tempstring = tempstring + " BUILT_ARCHS=\"\"\n"; tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; tempstring = tempstring + " OVERRULE_FLAG=\"no\"\n"; tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; @@ -183,14 +180,11 @@ def make_set_simd(dom) : tempstring = tempstring + " indCXX=no\n" tempstring = tempstring + " indLV_ARCH=no\n" elif atype == "all": - if arch == "orc": - tempstring = tempstring + " if test $HAVE_ORC = yes; then\n"; - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " fi\n"; - else: - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " if test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n" + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " fi\n" tempstring = tempstring + " ;;\n" tempstring = tempstring + " (powerpc)\n" @@ -234,11 +228,15 @@ def make_set_simd(dom) : tempstring = tempstring + " indCXX=no\n" tempstring = tempstring + " indLV_ARCH=no\n" elif atype == "all": + tempstring = tempstring + " if test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n" tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " fi\n" tempstring = tempstring + " ;;\n" tempstring = tempstring + " esac\n" tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" + tempstring = tempstring + " AM_CONDITIONAL(LV_HAVE_ORC, [test \"$LV_HAVE_ORC\" = \"yes\"])\n"; tempstring = tempstring + "])\n" return tempstring; -- cgit From 21426265324c883c91eeaaf75a81f2ccdc6e249d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 21:12:49 -0800 Subject: Volk: Build fixes to work with/without Orc. --- volk/include/volk/archs.xml | 2 +- volk/include/volk/make_set_simd.py | 51 +++++++++++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 5 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index b61bfca09..a19a5add9 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -7,7 +7,7 @@ lorc-0.4 - HAVE_ORC + LV_HAVE_ORC no diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index d7109cfcb..f2b7c0656 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -180,11 +180,22 @@ def make_set_simd(dom) : tempstring = tempstring + " indCXX=no\n" tempstring = tempstring + " indLV_ARCH=no\n" elif atype == "all": - tempstring = tempstring + " if test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n" + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; tempstring = tempstring + " fi\n" + tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " ;;\n" tempstring = tempstring + " (powerpc)\n" @@ -228,11 +239,44 @@ def make_set_simd(dom) : tempstring = tempstring + " indCXX=no\n" tempstring = tempstring + " indLV_ARCH=no\n" elif atype == "all": - tempstring = tempstring + " if test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n" + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " ;;\n" + tempstring = tempstring + " (*)\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + atype = str(domarch.attributes["type"].value); + flag = domarch.getElementsByTagName("flag"); + flag = str(flag[0].firstChild.data); + if atype == "all": + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; tempstring = tempstring + " fi\n" + tempstring = tempstring + " indLV_ARCH=no\n" tempstring = tempstring + " ;;\n" tempstring = tempstring + " esac\n" tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" @@ -240,6 +284,5 @@ def make_set_simd(dom) : tempstring = tempstring + "])\n" return tempstring; - - + -- cgit From f9ee6a55cb397f9302769a25a8c959fa162354f0 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 22:58:33 -0800 Subject: Volk: Some new basic Orc implementations with QA code --- volk/include/volk/volk_16u_byteswap_aligned16.h | 4 ++-- volk/include/volk/volk_32f_divide_aligned16.h | 13 +++++++++++++ volk/include/volk/volk_32f_multiply_aligned16.h | 14 +++++++++++++- volk/include/volk/volk_32f_subtract_aligned16.h | 14 ++++++++++++++ 4 files changed, 42 insertions(+), 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h index 8e628ab17..9d19d1a45 100644 --- a/volk/include/volk/volk_16u_byteswap_aligned16.h +++ b/volk/include/volk/volk_16u_byteswap_aligned16.h @@ -67,9 +67,9 @@ static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, uns \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, uint16_t* intsToSwapAgain, unsigned int num_points); +extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, unsigned int num_points); static inline void volk_16u_byteswap_aligned16_orc(uint16_t* intsToSwap, unsigned int num_points){ - volk_16u_byteswap_aligned16_orc_impl(intsToSwap, intsToSwap, num_points); + volk_16u_byteswap_aligned16_orc_impl(intsToSwap, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_divide_aligned16.h b/volk/include/volk/volk_32f_divide_aligned16.h index c00700cd8..c595b5e92 100644 --- a/volk/include/volk/volk_32f_divide_aligned16.h +++ b/volk/include/volk/volk_32f_divide_aligned16.h @@ -63,6 +63,19 @@ static inline void volk_32f_divide_aligned16_generic(float* cVector, const float } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +extern void volk_32f_divide_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_divide_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_divide_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_multiply_aligned16.h b/volk/include/volk/volk_32f_multiply_aligned16.h index b557580ab..87ae7bcf8 100644 --- a/volk/include/volk/volk_32f_multiply_aligned16.h +++ b/volk/include/volk/volk_32f_multiply_aligned16.h @@ -63,7 +63,19 @@ static inline void volk_32f_multiply_aligned16_generic(float* cVector, const flo } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +extern void volk_32f_multiply_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_multiply_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_subtract_aligned16.h b/volk/include/volk/volk_32f_subtract_aligned16.h index ac3f5e5d1..e15242901 100644 --- a/volk/include/volk/volk_32f_subtract_aligned16.h +++ b/volk/include/volk/volk_32f_subtract_aligned16.h @@ -63,5 +63,19 @@ static inline void volk_32f_subtract_aligned16_generic(float* cVector, const flo } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +extern void volk_32f_subtract_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_subtract_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_subtract_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + #endif /* INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H */ -- cgit From 15ad4b5398e474bfb52fdb7e826b69f3e398c0b0 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 15 Dec 2010 16:27:42 -0800 Subject: Volk: A bunch of new ORC routines plus tests. Also fixed a typo in the generic version of 16sc_magnitude_16s_a16. --- volk/include/volk/volk_16sc_magnitude_16s_aligned16.h | 15 +++++++++++++-- volk/include/volk/volk_16sc_magnitude_32f_aligned16.h | 14 +++++++++++++- volk/include/volk/volk_32fc_magnitude_16s_aligned16.h | 14 +++++++++++++- volk/include/volk/volk_32fc_magnitude_32f_aligned16.h | 13 ++++++++++++- volk/include/volk/volk_32s_or_aligned16.h | 14 +++++++++++++- 5 files changed, 64 insertions(+), 6 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h index 1482ab82e..9f3222aa6 100644 --- a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h +++ b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h @@ -164,7 +164,7 @@ static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeV const int16_t* complexVectorPtr = (const int16_t*)complexVector; int16_t* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; - const float scalar = 32786.0; + const float scalar = 32768.0; for(number = 0; number < num_points; number++){ float real = ((float)(*complexVectorPtr++)) / scalar; float imag = ((float)(*complexVectorPtr++)) / scalar; @@ -173,7 +173,18 @@ static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeV } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16sc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h index 9c2a48835..e063ae432 100644 --- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h +++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h @@ -161,7 +161,19 @@ static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVec } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16sc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16sc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16sc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h index 4e64d8c22..4e590e120 100644 --- a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h +++ b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h @@ -140,7 +140,19 @@ static inline void volk_32fc_magnitude_16s_aligned16_generic(int16_t* magnitudeV } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_32fc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_32fc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + volk_32fc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h index 7a8fd1ef9..3ea62da6a 100644 --- a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h +++ b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h @@ -115,7 +115,18 @@ static inline void volk_32fc_magnitude_32f_aligned16_generic(float* magnitudeVec } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +extern void volk_32fc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); +static inline void volk_32fc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + volk_32fc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_or_aligned16.h b/volk/include/volk/volk_32s_or_aligned16.h index f4c427c4d..64748d535 100644 --- a/volk/include/volk/volk_32s_or_aligned16.h +++ b/volk/include/volk/volk_32s_or_aligned16.h @@ -63,7 +63,19 @@ static inline void volk_32s_or_aligned16_generic(int32_t* cVector, const int32_t } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +extern void volk_32s_or_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32s_or_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32s_or_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_32s_OR_ALIGNED16_H */ -- cgit From 320c16f5acdcbca6f59c735ad19619d7b82c434b Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 16 Dec 2010 21:27:38 -0500 Subject: Changed python env variable to more globally usable version. --- volk/include/volk/volk_register.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index d00501a93..9fded9a3e 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -1,4 +1,4 @@ -#! /usr/bin/python +#! /usr/bin/env python import sys import re -- cgit From c6fff77de9b686761f93f0e1de237f8543f5e919 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 17 Dec 2010 11:14:41 -0800 Subject: Volk: A bunch of new Orc routines plus a couple of build changes. 32fc_magnitude_16s fails test_all right now. --- volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h | 14 +++++++++++++- volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h | 15 ++++++++++++++- .../volk/volk_16sc_deinterleave_real_8s_aligned16.h | 13 ++++++++++++- volk/include/volk/volk_16sc_magnitude_16s_aligned16.h | 6 +++--- volk/include/volk/volk_16sc_magnitude_32f_aligned16.h | 2 +- volk/include/volk/volk_32f_max_aligned16.h | 14 ++++++++++++++ volk/include/volk/volk_32f_min_aligned16.h | 14 ++++++++++++++ 7 files changed, 71 insertions(+), 7 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h index 32e13df98..cf94a3f38 100644 --- a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h +++ b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h @@ -140,7 +140,19 @@ static inline void volk_16sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16sc_deinterleave_16s_aligned16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16sc_deinterleave_16s_aligned16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16sc_deinterleave_16s_aligned16_orc_impl(iBuffer, qBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h index 86f67437d..50b8b62d5 100644 --- a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h +++ b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h @@ -89,7 +89,20 @@ static inline void volk_16sc_deinterleave_32f_aligned16_generic(float* iBuffer, } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +extern void volk_16sc_deinterleave_32f_aligned16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16sc_deinterleave_32f_aligned16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16sc_deinterleave_32f_aligned16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h index c0d1e941a..2dd85a422 100644 --- a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h +++ b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h @@ -77,7 +77,18 @@ static inline void volk_16sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuf } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16sc_deinterleave_real_8s_aligned16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16sc_deinterleave_real_8s_aligned16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16sc_deinterleave_real_8s_aligned16_orc_impl(iBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h index 9f3222aa6..41e8751d6 100644 --- a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h +++ b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h @@ -173,16 +173,16 @@ static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeV } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#if LV_HAVE_ORC_DISABLED /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points); +extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); static inline void volk_16sc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, num_points); + volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h index e063ae432..c2605d551 100644 --- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h +++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h @@ -161,7 +161,7 @@ static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVec } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#if LV_HAVE_ORC_DISABLED /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values diff --git a/volk/include/volk/volk_32f_max_aligned16.h b/volk/include/volk/volk_32f_max_aligned16.h index 96aafb2bf..d4e30fba8 100644 --- a/volk/include/volk/volk_32f_max_aligned16.h +++ b/volk/include/volk/volk_32f_max_aligned16.h @@ -67,5 +67,19 @@ static inline void volk_32f_max_aligned16_generic(float* cVector, const float* a } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_max_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_max_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_max_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + #endif /* INCLUDED_VOLK_32f_MAX_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_min_aligned16.h b/volk/include/volk/volk_32f_min_aligned16.h index e247f4213..55daafb6a 100644 --- a/volk/include/volk/volk_32f_min_aligned16.h +++ b/volk/include/volk/volk_32f_min_aligned16.h @@ -67,5 +67,19 @@ static inline void volk_32f_min_aligned16_generic(float* cVector, const float* a } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_min_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_min_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_min_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + #endif /* INCLUDED_VOLK_32f_MIN_ALIGNED16_H */ -- cgit From ed78ba5d9999bbe50507373a1aa2877ef0da64c6 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 17 Dec 2010 22:58:40 -0600 Subject: volk: Fix for popcnt's 64/32-bit issues. --- volk/include/volk/volk_64u_popcnt_aligned16.h | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_64u_popcnt_aligned16.h b/volk/include/volk/volk_64u_popcnt_aligned16.h index e8e1396e7..4d62f9375 100644 --- a/volk/include/volk/volk_64u_popcnt_aligned16.h +++ b/volk/include/volk/volk_64u_popcnt_aligned16.h @@ -36,37 +36,13 @@ static inline void volk_64u_popcnt_aligned16_generic(uint64_t* ret, const uint64 #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE4_2 +#if LV_HAVE_SSE4_2 && LV_HAVE_64 #include static inline void volk_64u_popcnt_aligned16_sse4_2(uint64_t* ret, const uint64_t value) { -#if LV_64 *ret = _mm_popcnt_u64(value); -#else - const uint32_t* valueVector = (const uint32_t*)&value; - - // This is faster than a lookup table - uint32_t retVal = valueVector[0]; - - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - uint64_t retVal64 = retVal; - - retVal = valueVector[1]; - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - retVal64 += retVal; - - *ret = retVal64; -#endif } #endif /*LV_HAVE_SSE4_2*/ -- cgit From 79c514b542d25e709903b41cfdc1673aae35ac1d Mon Sep 17 00:00:00 2001 From: Eric Blossom Date: Thu, 23 Dec 2010 14:29:56 -0800 Subject: Update volk .gitignores --- volk/include/volk/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'volk/include') diff --git a/volk/include/volk/.gitignore b/volk/include/volk/.gitignore index ae4824cc3..be8358f3a 100644 --- a/volk/include/volk/.gitignore +++ b/volk/include/volk/.gitignore @@ -17,3 +17,4 @@ /volk_runtime.h /volk_tables.h /volk_typedefs.h +/volk_mktables -- cgit From 5b45b875ed58fd66234764a05da42c6eaff22c4d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 11 Jan 2011 15:17:55 -0800 Subject: Volk: Added more Orc routines (including complex multiply). Started redoing the testing framework so it's easier to add new archs to tests. --- volk/include/volk/volk_32f_normalize_aligned16.h | 15 +++++++++++++++ volk/include/volk/volk_32fc_32f_multiply_aligned16.h | 13 +++++++++++++ volk/include/volk/volk_32fc_multiply_aligned16.h | 17 +++++++++++++++++ 3 files changed, 45 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h index 1aabb1d9d..27fb5f7fa 100644 --- a/volk/include/volk/volk_32f_normalize_aligned16.h +++ b/volk/include/volk/volk_32f_normalize_aligned16.h @@ -60,6 +60,21 @@ static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Normalizes the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be normalizeed + \param bVector One of the vectors to be normalizeed + \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector +*/ +extern void volk_32f_normalize_aligned16_orc_impl(float* vecBuffer, const float scalar, unsigned int num_points); +static inline void volk_32f_normalize_aligned16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ + float invscalar = 1.0 / scalar; + volk_32f_normalize_aligned16_orc_impl(vecBuffer, invscalar, num_points); +} +#endif /* LV_HAVE_GENERIC */ + diff --git a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h index 436656ca0..304ed8e2d 100644 --- a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h +++ b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h @@ -76,6 +76,19 @@ static inline void volk_32fc_32f_multiply_aligned16_generic(lv_32fc_t* cVector, } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_32f_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32fc_32f_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + volk_32fc_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32fc_multiply_aligned16.h b/volk/include/volk/volk_32fc_multiply_aligned16.h index 6a1649fdb..c8f2418c3 100644 --- a/volk/include/volk/volk_32fc_multiply_aligned16.h +++ b/volk/include/volk/volk_32fc_multiply_aligned16.h @@ -4,6 +4,7 @@ #include #include #include +#include #if LV_HAVE_SSE3 #include @@ -72,6 +73,22 @@ static inline void volk_32fc_multiply_aligned16_generic(lv_32fc_t* cVector, cons } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); +static inline void volk_32fc_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + static const float mask = -0.0; + volk_32fc_multiply_aligned16_orc_impl(cVector, aVector, bVector, mask, num_points); +} +#endif /* LV_HAVE_ORC */ + + -- cgit From c501dc110d3cc7cfcfff178fecb21f30ac9bd54c Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 11 Jan 2011 15:35:04 -0800 Subject: Volk: fixed normalize. --- volk/include/volk/volk_32f_normalize_aligned16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h index 27fb5f7fa..323d0513c 100644 --- a/volk/include/volk/volk_32f_normalize_aligned16.h +++ b/volk/include/volk/volk_32f_normalize_aligned16.h @@ -68,10 +68,10 @@ static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const \param bVector One of the vectors to be normalizeed \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector */ -extern void volk_32f_normalize_aligned16_orc_impl(float* vecBuffer, const float scalar, unsigned int num_points); +extern void volk_32f_normalize_aligned16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); static inline void volk_32f_normalize_aligned16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ float invscalar = 1.0 / scalar; - volk_32f_normalize_aligned16_orc_impl(vecBuffer, invscalar, num_points); + volk_32f_normalize_aligned16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); } #endif /* LV_HAVE_GENERIC */ -- cgit From d486ff4b4c039c8b3b06b6519839d522cf69be69 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Sun, 16 Jan 2011 14:03:16 -0800 Subject: volk_rename: renamed basically everything in the volk lib to have logically consistent function names --- volk/include/volk/Makefile.am | 174 ++++---- volk/include/volk/make_c.py | 3 +- volk/include/volk/volk_16s_add_quad_a16.h | 136 ++++++ volk/include/volk/volk_16s_add_quad_aligned16.h | 136 ------ volk/include/volk/volk_16s_branch_4_state_8_a16.h | 194 +++++++++ .../volk/volk_16s_branch_4_state_8_aligned16.h | 194 --------- volk/include/volk/volk_16s_convert_32f_aligned16.h | 119 ------ .../volk/volk_16s_convert_32f_unaligned16.h | 122 ------ volk/include/volk/volk_16s_convert_8s_a16.h | 69 +++ volk/include/volk/volk_16s_convert_8s_aligned16.h | 69 --- volk/include/volk/volk_16s_convert_8s_ua16.h | 71 ++++ .../include/volk/volk_16s_convert_8s_unaligned16.h | 71 ---- volk/include/volk/volk_16s_max_star_16s_a16.h | 108 +++++ volk/include/volk/volk_16s_max_star_aligned16.h | 108 ----- .../volk/volk_16s_max_star_horizontal_16s_a16.h | 130 ++++++ .../volk/volk_16s_max_star_horizontal_aligned16.h | 130 ------ .../volk/volk_16s_permute_and_scalar_add_a16.h | 139 ++++++ .../volk_16s_permute_and_scalar_add_aligned16.h | 139 ------ volk/include/volk/volk_16s_quad_max_star_16s_a16.h | 191 +++++++++ .../volk/volk_16s_quad_max_star_aligned16.h | 191 --------- volk/include/volk/volk_16s_s32f_convert_32f_a16.h | 119 ++++++ volk/include/volk/volk_16s_s32f_convert_32f_ua16.h | 122 ++++++ .../volk/volk_16sc_deinterleave_16s_16s_a16.h | 158 +++++++ .../volk/volk_16sc_deinterleave_16s_aligned16.h | 158 ------- .../volk/volk_16sc_deinterleave_32f_aligned16.h | 108 ----- .../volk/volk_16sc_deinterleave_real_16s_a16.h | 120 ++++++ .../volk_16sc_deinterleave_real_16s_aligned16.h | 120 ------ .../volk_16sc_deinterleave_real_32f_aligned16.h | 125 ------ .../volk/volk_16sc_deinterleave_real_8s_a16.h | 94 +++++ .../volk_16sc_deinterleave_real_8s_aligned16.h | 94 ----- volk/include/volk/volk_16sc_magnitude_16s_a16.h | 190 +++++++++ .../volk/volk_16sc_magnitude_16s_aligned16.h | 190 --------- .../volk/volk_16sc_magnitude_32f_aligned16.h | 179 -------- .../volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h | 108 +++++ .../volk_16sc_s32f_deinterleave_real_32f_a16.h | 125 ++++++ .../volk/volk_16sc_s32f_magnitude_32f_a16.h | 179 ++++++++ volk/include/volk/volk_16u_byteswap_a16.h | 77 ++++ volk/include/volk/volk_16u_byteswap_aligned16.h | 77 ---- .../volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h | 151 +++++++ volk/include/volk/volk_32f_32f_add_32f_a16.h | 81 ++++ volk/include/volk/volk_32f_32f_divide_32f_a16.h | 82 ++++ volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h | 184 ++++++++ volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h | 184 ++++++++ .../volk/volk_32f_32f_interleave_32fc_a16.h | 75 ++++ volk/include/volk/volk_32f_32f_max_32f_a16.h | 85 ++++ volk/include/volk/volk_32f_32f_min_32f_a16.h | 85 ++++ volk/include/volk/volk_32f_32f_multiply_32f_a16.h | 81 ++++ .../volk/volk_32f_32f_s32f_interleave_16sc_a16.h | 155 +++++++ volk/include/volk/volk_32f_32f_subtract_32f_a16.h | 81 ++++ volk/include/volk/volk_32f_accumulator_aligned16.h | 67 --- volk/include/volk/volk_32f_accumulator_s32f_a16.h | 67 +++ volk/include/volk/volk_32f_add_aligned16.h | 81 ---- .../volk/volk_32f_calc_spectral_noise_floor_a16.h | 167 ++++++++ .../volk_32f_calc_spectral_noise_floor_aligned16.h | 167 -------- volk/include/volk/volk_32f_convert_16s_aligned16.h | 110 ----- .../volk/volk_32f_convert_16s_unaligned16.h | 113 ----- volk/include/volk/volk_32f_convert_32s_aligned16.h | 106 ----- .../volk/volk_32f_convert_32s_unaligned16.h | 109 ----- volk/include/volk/volk_32f_convert_64f_a16.h | 70 +++ volk/include/volk/volk_32f_convert_64f_aligned16.h | 70 --- volk/include/volk/volk_32f_convert_64f_ua16.h | 70 +++ .../volk/volk_32f_convert_64f_unaligned16.h | 70 --- volk/include/volk/volk_32f_convert_8s_aligned16.h | 117 ------ .../include/volk/volk_32f_convert_8s_unaligned16.h | 120 ------ volk/include/volk/volk_32f_divide_aligned16.h | 82 ---- volk/include/volk/volk_32f_dot_prod_aligned16.h | 184 -------- volk/include/volk/volk_32f_dot_prod_unaligned16.h | 184 -------- volk/include/volk/volk_32f_fm_detect_aligned16.h | 120 ------ volk/include/volk/volk_32f_index_max_16u_a16.h | 148 +++++++ volk/include/volk/volk_32f_index_max_aligned16.h | 148 ------- .../volk/volk_32f_interleave_16sc_aligned16.h | 155 ------- .../volk/volk_32f_interleave_32fc_aligned16.h | 75 ---- volk/include/volk/volk_32f_max_aligned16.h | 85 ---- volk/include/volk/volk_32f_min_aligned16.h | 85 ---- volk/include/volk/volk_32f_multiply_aligned16.h | 81 ---- volk/include/volk/volk_32f_normalize_aligned16.h | 81 ---- volk/include/volk/volk_32f_power_aligned16.h | 144 ------- .../volk/volk_32f_s32f_32f_fm_detect_32f_a16.h | 120 ++++++ volk/include/volk/volk_32f_s32f_convert_16s_a16.h | 110 +++++ volk/include/volk/volk_32f_s32f_convert_16s_ua16.h | 113 +++++ volk/include/volk/volk_32f_s32f_convert_32s_a16.h | 106 +++++ volk/include/volk/volk_32f_s32f_convert_32s_ua16.h | 109 +++++ volk/include/volk/volk_32f_s32f_convert_8s_a16.h | 117 ++++++ volk/include/volk/volk_32f_s32f_convert_8s_ua16.h | 120 ++++++ volk/include/volk/volk_32f_s32f_normalize_a16.h | 81 ++++ volk/include/volk/volk_32f_s32f_power_32f_a16.h | 144 +++++++ volk/include/volk/volk_32f_s32f_stddev_32f_a16.h | 144 +++++++ volk/include/volk/volk_32f_sqrt_32f_a16.h | 77 ++++ volk/include/volk/volk_32f_sqrt_aligned16.h | 77 ---- volk/include/volk/volk_32f_stddev_aligned16.h | 144 ------- .../volk/volk_32f_stddev_and_mean_32f_32f_a16.h | 169 ++++++++ .../volk/volk_32f_stddev_and_mean_aligned16.h | 169 -------- volk/include/volk/volk_32f_subtract_aligned16.h | 81 ---- volk/include/volk/volk_32f_sum_of_poly_aligned16.h | 151 ------- .../include/volk/volk_32fc_32f_multiply_32fc_a16.h | 95 +++++ .../volk/volk_32fc_32f_multiply_aligned16.h | 95 ----- volk/include/volk/volk_32fc_32f_power_32fc_a16.h | 109 +++++ .../volk/volk_32fc_32f_power_32fc_aligned16.h | 109 ----- .../volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h | 344 +++++++++++++++ .../volk/volk_32fc_32fc_dot_prod_32fc_a16.h | 468 +++++++++++++++++++++ .../volk/volk_32fc_32fc_multiply_32fc_a16.h | 95 +++++ ...2fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h | 126 ++++++ .../volk/volk_32fc_32fc_square_dist_32f_a16.h | 112 +++++ volk/include/volk/volk_32fc_atan2_32f_aligned16.h | 158 ------- .../volk/volk_32fc_conjugate_dot_prod_aligned16.h | 344 --------------- .../volk/volk_32fc_deinterleave_32f_32f_a16.h | 75 ++++ .../volk/volk_32fc_deinterleave_32f_aligned16.h | 75 ---- .../volk/volk_32fc_deinterleave_64f_64f_a16.h | 78 ++++ .../volk/volk_32fc_deinterleave_64f_aligned16.h | 78 ---- .../volk/volk_32fc_deinterleave_real_16s_a16.h | 80 ++++ .../volk_32fc_deinterleave_real_16s_aligned16.h | 80 ---- .../volk/volk_32fc_deinterleave_real_32f_a16.h | 68 +++ .../volk_32fc_deinterleave_real_32f_aligned16.h | 68 --- .../volk/volk_32fc_deinterleave_real_64f_a16.h | 66 +++ .../volk_32fc_deinterleave_real_64f_aligned16.h | 66 --- volk/include/volk/volk_32fc_dot_prod_aligned16.h | 468 --------------------- volk/include/volk/volk_32fc_index_max_16u_a16.h | 215 ++++++++++ volk/include/volk/volk_32fc_index_max_aligned16.h | 215 ---------- .../volk/volk_32fc_magnitude_16s_aligned16.h | 158 ------- volk/include/volk/volk_32fc_magnitude_32f_a16.h | 132 ++++++ .../volk/volk_32fc_magnitude_32f_aligned16.h | 132 ------ volk/include/volk/volk_32fc_multiply_aligned16.h | 95 ----- ...olk_32fc_power_spectral_density_32f_aligned16.h | 134 ------ .../volk/volk_32fc_power_spectrum_32f_aligned16.h | 126 ------ volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h | 158 +++++++ .../volk/volk_32fc_s32f_magnitude_16s_a16.h | 158 +++++++ .../volk/volk_32fc_s32f_power_spectrum_32f_a16.h | 126 ++++++ ...32fc_s32f_s32f_power_spectral_density_32f_a16.h | 134 ++++++ .../include/volk/volk_32fc_square_dist_aligned16.h | 112 ----- .../volk_32fc_square_dist_scalar_mult_aligned16.h | 126 ------ volk/include/volk/volk_32s_32s_and_32s_a16.h | 81 ++++ volk/include/volk/volk_32s_32s_or_32s_a16.h | 81 ++++ volk/include/volk/volk_32s_and_aligned16.h | 81 ---- volk/include/volk/volk_32s_convert_32f_aligned16.h | 73 ---- .../volk/volk_32s_convert_32f_unaligned16.h | 75 ---- volk/include/volk/volk_32s_or_aligned16.h | 81 ---- volk/include/volk/volk_32s_s32f_convert_32f_a16.h | 73 ++++ volk/include/volk/volk_32s_s32f_convert_32f_ua16.h | 75 ++++ volk/include/volk/volk_32u_byteswap_a16.h | 77 ++++ volk/include/volk/volk_32u_byteswap_aligned16.h | 77 ---- volk/include/volk/volk_32u_popcnt_a16.h | 36 ++ volk/include/volk/volk_32u_popcnt_aligned16.h | 36 -- volk/include/volk/volk_64f_64f_max_64f_a16.h | 71 ++++ volk/include/volk/volk_64f_64f_min_64f_a16.h | 71 ++++ volk/include/volk/volk_64f_convert_32f_a16.h | 67 +++ volk/include/volk/volk_64f_convert_32f_aligned16.h | 67 --- volk/include/volk/volk_64f_convert_32f_ua16.h | 67 +++ .../volk/volk_64f_convert_32f_unaligned16.h | 67 --- volk/include/volk/volk_64f_max_aligned16.h | 71 ---- volk/include/volk/volk_64f_min_aligned16.h | 71 ---- volk/include/volk/volk_64u_byteswap_a16.h | 88 ++++ volk/include/volk/volk_64u_byteswap_aligned16.h | 88 ---- volk/include/volk/volk_64u_popcnt_a16.h | 50 +++ volk/include/volk/volk_64u_popcnt_aligned16.h | 50 --- volk/include/volk/volk_8s_convert_16s_a16.h | 83 ++++ volk/include/volk/volk_8s_convert_16s_aligned16.h | 83 ---- volk/include/volk/volk_8s_convert_16s_ua16.h | 73 ++++ .../include/volk/volk_8s_convert_16s_unaligned16.h | 73 ---- volk/include/volk/volk_8s_convert_32f_aligned16.h | 105 ----- .../include/volk/volk_8s_convert_32f_unaligned16.h | 94 ----- volk/include/volk/volk_8s_s32f_convert_32f_a16.h | 105 +++++ volk/include/volk/volk_8s_s32f_convert_32f_ua16.h | 94 +++++ .../volk_8sc_8sc_multiply_conjugate_16sc_a16.h | 102 +++++ ...volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h | 122 ++++++ .../volk/volk_8sc_deinterleave_16s_16s_a16.h | 77 ++++ .../volk/volk_8sc_deinterleave_16s_aligned16.h | 77 ---- .../volk/volk_8sc_deinterleave_32f_aligned16.h | 164 -------- .../volk/volk_8sc_deinterleave_real_16s_a16.h | 66 +++ .../volk_8sc_deinterleave_real_16s_aligned16.h | 66 --- .../volk_8sc_deinterleave_real_32f_aligned16.h | 133 ------ .../volk/volk_8sc_deinterleave_real_8s_a16.h | 67 +++ .../volk/volk_8sc_deinterleave_real_8s_aligned16.h | 67 --- .../volk_8sc_multiply_conjugate_16sc_aligned16.h | 102 ----- .../volk_8sc_multiply_conjugate_32fc_aligned16.h | 122 ------ .../volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h | 164 ++++++++ .../volk/volk_8sc_s32f_deinterleave_real_32f_a16.h | 133 ++++++ volk/include/volk/volk_register.py | 5 +- 177 files changed, 10158 insertions(+), 10160 deletions(-) create mode 100644 volk/include/volk/volk_16s_add_quad_a16.h delete mode 100644 volk/include/volk/volk_16s_add_quad_aligned16.h create mode 100644 volk/include/volk/volk_16s_branch_4_state_8_a16.h delete mode 100644 volk/include/volk/volk_16s_branch_4_state_8_aligned16.h delete mode 100644 volk/include/volk/volk_16s_convert_32f_aligned16.h delete mode 100644 volk/include/volk/volk_16s_convert_32f_unaligned16.h create mode 100644 volk/include/volk/volk_16s_convert_8s_a16.h delete mode 100644 volk/include/volk/volk_16s_convert_8s_aligned16.h create mode 100644 volk/include/volk/volk_16s_convert_8s_ua16.h delete mode 100644 volk/include/volk/volk_16s_convert_8s_unaligned16.h create mode 100644 volk/include/volk/volk_16s_max_star_16s_a16.h delete mode 100644 volk/include/volk/volk_16s_max_star_aligned16.h create mode 100644 volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h delete mode 100644 volk/include/volk/volk_16s_max_star_horizontal_aligned16.h create mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_a16.h delete mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h create mode 100644 volk/include/volk/volk_16s_quad_max_star_16s_a16.h delete mode 100644 volk/include/volk/volk_16s_quad_max_star_aligned16.h create mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_ua16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h create mode 100644 volk/include/volk/volk_16sc_magnitude_16s_a16.h delete mode 100644 volk/include/volk/volk_16sc_magnitude_16s_aligned16.h delete mode 100644 volk/include/volk/volk_16sc_magnitude_32f_aligned16.h create mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h create mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h create mode 100644 volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h create mode 100644 volk/include/volk/volk_16u_byteswap_a16.h delete mode 100644 volk/include/volk/volk_16u_byteswap_aligned16.h create mode 100644 volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_add_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_divide_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h create mode 100644 volk/include/volk/volk_32f_32f_interleave_32fc_a16.h create mode 100644 volk/include/volk/volk_32f_32f_max_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_min_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_multiply_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h create mode 100644 volk/include/volk/volk_32f_32f_subtract_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_accumulator_aligned16.h create mode 100644 volk/include/volk/volk_32f_accumulator_s32f_a16.h delete mode 100644 volk/include/volk/volk_32f_add_aligned16.h create mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h delete mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h delete mode 100644 volk/include/volk/volk_32f_convert_16s_aligned16.h delete mode 100644 volk/include/volk/volk_32f_convert_16s_unaligned16.h delete mode 100644 volk/include/volk/volk_32f_convert_32s_aligned16.h delete mode 100644 volk/include/volk/volk_32f_convert_32s_unaligned16.h create mode 100644 volk/include/volk/volk_32f_convert_64f_a16.h delete mode 100644 volk/include/volk/volk_32f_convert_64f_aligned16.h create mode 100644 volk/include/volk/volk_32f_convert_64f_ua16.h delete mode 100644 volk/include/volk/volk_32f_convert_64f_unaligned16.h delete mode 100644 volk/include/volk/volk_32f_convert_8s_aligned16.h delete mode 100644 volk/include/volk/volk_32f_convert_8s_unaligned16.h delete mode 100644 volk/include/volk/volk_32f_divide_aligned16.h delete mode 100644 volk/include/volk/volk_32f_dot_prod_aligned16.h delete mode 100644 volk/include/volk/volk_32f_dot_prod_unaligned16.h delete mode 100644 volk/include/volk/volk_32f_fm_detect_aligned16.h create mode 100644 volk/include/volk/volk_32f_index_max_16u_a16.h delete mode 100644 volk/include/volk/volk_32f_index_max_aligned16.h delete mode 100644 volk/include/volk/volk_32f_interleave_16sc_aligned16.h delete mode 100644 volk/include/volk/volk_32f_interleave_32fc_aligned16.h delete mode 100644 volk/include/volk/volk_32f_max_aligned16.h delete mode 100644 volk/include/volk/volk_32f_min_aligned16.h delete mode 100644 volk/include/volk/volk_32f_multiply_aligned16.h delete mode 100644 volk/include/volk/volk_32f_normalize_aligned16.h delete mode 100644 volk/include/volk/volk_32f_power_aligned16.h create mode 100644 volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_ua16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_ua16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_ua16.h create mode 100644 volk/include/volk/volk_32f_s32f_normalize_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_power_32f_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_stddev_32f_a16.h create mode 100644 volk/include/volk/volk_32f_sqrt_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_sqrt_aligned16.h delete mode 100644 volk/include/volk/volk_32f_stddev_aligned16.h create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_stddev_and_mean_aligned16.h delete mode 100644 volk/include/volk/volk_32f_subtract_aligned16.h delete mode 100644 volk/include/volk/volk_32f_sum_of_poly_aligned16.h create mode 100644 volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h delete mode 100644 volk/include/volk/volk_32fc_32f_multiply_aligned16.h create mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_a16.h delete mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h create mode 100644 volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_atan2_32f_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_dot_prod_aligned16.h create mode 100644 volk/include/volk/volk_32fc_index_max_16u_a16.h delete mode 100644 volk/include/volk/volk_32fc_index_max_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_magnitude_16s_aligned16.h create mode 100644 volk/include/volk/volk_32fc_magnitude_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_magnitude_32f_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_multiply_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_square_dist_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h create mode 100644 volk/include/volk/volk_32s_32s_and_32s_a16.h create mode 100644 volk/include/volk/volk_32s_32s_or_32s_a16.h delete mode 100644 volk/include/volk/volk_32s_and_aligned16.h delete mode 100644 volk/include/volk/volk_32s_convert_32f_aligned16.h delete mode 100644 volk/include/volk/volk_32s_convert_32f_unaligned16.h delete mode 100644 volk/include/volk/volk_32s_or_aligned16.h create mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_ua16.h create mode 100644 volk/include/volk/volk_32u_byteswap_a16.h delete mode 100644 volk/include/volk/volk_32u_byteswap_aligned16.h create mode 100644 volk/include/volk/volk_32u_popcnt_a16.h delete mode 100644 volk/include/volk/volk_32u_popcnt_aligned16.h create mode 100644 volk/include/volk/volk_64f_64f_max_64f_a16.h create mode 100644 volk/include/volk/volk_64f_64f_min_64f_a16.h create mode 100644 volk/include/volk/volk_64f_convert_32f_a16.h delete mode 100644 volk/include/volk/volk_64f_convert_32f_aligned16.h create mode 100644 volk/include/volk/volk_64f_convert_32f_ua16.h delete mode 100644 volk/include/volk/volk_64f_convert_32f_unaligned16.h delete mode 100644 volk/include/volk/volk_64f_max_aligned16.h delete mode 100644 volk/include/volk/volk_64f_min_aligned16.h create mode 100644 volk/include/volk/volk_64u_byteswap_a16.h delete mode 100644 volk/include/volk/volk_64u_byteswap_aligned16.h create mode 100644 volk/include/volk/volk_64u_popcnt_a16.h delete mode 100644 volk/include/volk/volk_64u_popcnt_aligned16.h create mode 100644 volk/include/volk/volk_8s_convert_16s_a16.h delete mode 100644 volk/include/volk/volk_8s_convert_16s_aligned16.h create mode 100644 volk/include/volk/volk_8s_convert_16s_ua16.h delete mode 100644 volk/include/volk/volk_8s_convert_16s_unaligned16.h delete mode 100644 volk/include/volk/volk_8s_convert_32f_aligned16.h delete mode 100644 volk/include/volk/volk_8s_convert_32f_unaligned16.h create mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_ua16.h create mode 100644 volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h create mode 100644 volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h delete mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h delete mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h create mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h create mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 99276ab87..aef1d7ba8 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -41,93 +41,93 @@ volkinclude_HEADERS = \ volk.h \ volk_cpu.h \ volk_environment_init.h \ - volk_16s_add_quad_aligned16.h \ - volk_16s_branch_4_state_8_aligned16.h \ - volk_16sc_deinterleave_16s_aligned16.h \ - volk_16sc_deinterleave_32f_aligned16.h \ - volk_16sc_deinterleave_real_16s_aligned16.h \ - volk_16sc_deinterleave_real_32f_aligned16.h \ - volk_16sc_deinterleave_real_8s_aligned16.h \ - volk_16sc_magnitude_16s_aligned16.h \ - volk_16sc_magnitude_32f_aligned16.h \ - volk_16s_convert_32f_aligned16.h \ - volk_16s_convert_32f_unaligned16.h \ - volk_16s_convert_8s_aligned16.h \ - volk_16s_convert_8s_unaligned16.h \ - volk_16s_max_star_aligned16.h \ - volk_16s_max_star_horizontal_aligned16.h \ - volk_16s_permute_and_scalar_add_aligned16.h \ - volk_16s_quad_max_star_aligned16.h \ - volk_16u_byteswap_aligned16.h \ - volk_32f_accumulator_aligned16.h \ - volk_32f_add_aligned16.h \ - volk_32fc_32f_multiply_aligned16.h \ - volk_32fc_32f_power_32fc_aligned16.h \ - volk_32f_calc_spectral_noise_floor_aligned16.h \ - volk_32fc_atan2_32f_aligned16.h \ - volk_32fc_conjugate_dot_prod_aligned16.h \ - volk_32fc_deinterleave_32f_aligned16.h \ - volk_32fc_deinterleave_64f_aligned16.h \ - volk_32fc_deinterleave_real_16s_aligned16.h \ - volk_32fc_deinterleave_real_32f_aligned16.h \ - volk_32fc_deinterleave_real_64f_aligned16.h \ - volk_32fc_dot_prod_aligned16.h \ - volk_32fc_index_max_aligned16.h \ - volk_32fc_magnitude_16s_aligned16.h \ - volk_32fc_magnitude_32f_aligned16.h \ - volk_32fc_multiply_aligned16.h \ - volk_32f_convert_16s_aligned16.h \ - volk_32f_convert_16s_unaligned16.h \ - volk_32f_convert_32s_aligned16.h \ - volk_32f_convert_32s_unaligned16.h \ - volk_32f_convert_64f_aligned16.h \ - volk_32f_convert_64f_unaligned16.h \ - volk_32f_convert_8s_aligned16.h \ - volk_32f_convert_8s_unaligned16.h \ - volk_32fc_power_spectral_density_32f_aligned16.h \ - volk_32fc_power_spectrum_32f_aligned16.h \ - volk_32fc_square_dist_aligned16.h \ - volk_32fc_square_dist_scalar_mult_aligned16.h \ - volk_32f_divide_aligned16.h \ - volk_32f_dot_prod_aligned16.h \ - volk_32f_dot_prod_unaligned16.h \ - volk_32f_fm_detect_aligned16.h \ - volk_32f_index_max_aligned16.h \ - volk_32f_interleave_16sc_aligned16.h \ - volk_32f_interleave_32fc_aligned16.h \ - volk_32f_max_aligned16.h \ - volk_32f_min_aligned16.h \ - volk_32f_multiply_aligned16.h \ - volk_32f_normalize_aligned16.h \ - volk_32f_power_aligned16.h \ - volk_32f_sqrt_aligned16.h \ - volk_32f_stddev_aligned16.h \ - volk_32f_stddev_and_mean_aligned16.h \ - volk_32f_subtract_aligned16.h \ - volk_32f_sum_of_poly_aligned16.h \ - volk_32s_and_aligned16.h \ - volk_32s_convert_32f_aligned16.h \ - volk_32s_convert_32f_unaligned16.h \ - volk_32s_or_aligned16.h \ - volk_32u_byteswap_aligned16.h \ - volk_32u_popcnt_aligned16.h \ - volk_64f_convert_32f_aligned16.h \ - volk_64f_convert_32f_unaligned16.h \ - volk_64f_max_aligned16.h \ - volk_64f_min_aligned16.h \ - volk_64u_byteswap_aligned16.h \ - volk_64u_popcnt_aligned16.h \ - volk_8sc_deinterleave_16s_aligned16.h \ - volk_8sc_deinterleave_32f_aligned16.h \ - volk_8sc_deinterleave_real_16s_aligned16.h \ - volk_8sc_deinterleave_real_32f_aligned16.h \ - volk_8sc_deinterleave_real_8s_aligned16.h \ - volk_8sc_multiply_conjugate_16sc_aligned16.h \ - volk_8sc_multiply_conjugate_32fc_aligned16.h \ - volk_8s_convert_16s_aligned16.h \ - volk_8s_convert_16s_unaligned16.h \ - volk_8s_convert_32f_aligned16.h \ - volk_8s_convert_32f_unaligned16.h + volk_16s_add_quad_a16.h \ + volk_16s_branch_4_state_8_a16.h \ + volk_16sc_deinterleave_16s_16s_a16.h \ + volk_16sc_s32f_deinterleave_32f_32f_a16.h \ + volk_16sc_deinterleave_real_16s_a16.h \ + volk_16sc_s32f_deinterleave_real_32f_a16.h \ + volk_16sc_deinterleave_real_8s_a16.h \ + volk_16sc_magnitude_16s_a16.h \ + volk_16sc_s32f_magnitude_32f_a16.h \ + volk_16s_s32f_convert_32f_a16.h \ + volk_16s_s32f_convert_32f_ua16.h \ + volk_16s_convert_8s_a16.h \ + volk_16s_convert_8s_ua16.h \ + volk_16s_max_star_16s_a16.h \ + volk_16s_max_star_horizontal_16s_a16.h \ + volk_16s_permute_and_scalar_add_a16.h \ + volk_16s_quad_max_star_16s_a16.h \ + volk_16u_byteswap_a16.h \ + volk_32f_accumulator_s32f_a16.h \ + volk_32f_32f_add_32f_a16.h \ + volk_32fc_32f_multiply_32fc_a16.h \ + volk_32fc_32f_power_32fc_a16.h \ + volk_32f_calc_spectral_noise_floor_a16.h \ + volk_32fc_s32f_atan2_32f_a16.h \ + volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h \ + volk_32fc_deinterleave_32f_32f_a16.h \ + volk_32fc_deinterleave_64f_64f_a16.h \ + volk_32fc_deinterleave_real_16s_a16.h \ + volk_32fc_deinterleave_real_32f_a16.h \ + volk_32fc_deinterleave_real_64f_a16.h \ + volk_32fc_32fc_dot_prod_32fc_a16.h \ + volk_32fc_index_max_16u_a16.h \ + volk_32fc_s32f_magnitude_16s_a16.h \ + volk_32fc_magnitude_32f_a16.h \ + volk_32fc_32fc_multiply_32fc_a16.h \ + volk_32f_s32f_convert_16s_a16.h \ + volk_32f_s32f_convert_16s_ua16.h \ + volk_32f_s32f_convert_32s_a16.h \ + volk_32f_s32f_convert_32s_ua16.h \ + volk_32f_convert_64f_a16.h \ + volk_32f_convert_64f_ua16.h \ + volk_32f_s32f_convert_8s_a16.h \ + volk_32f_s32f_convert_8s_ua16.h \ + volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h \ + volk_32fc_s32f_power_spectrum_32f_a16.h \ + volk_32fc_32fc_square_dist_32f_a16.h \ + volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h \ + volk_32f_32f_divide_32f_a16.h \ + volk_32f_32f_dot_prod_32f_a16.h \ + volk_32f_32f_dot_prod_32f_ua16.h \ + volk_32f_s32f_32f_fm_detect_32f_a16.h \ + volk_32f_index_max_16u_a16.h \ + volk_32f_32f_s32f_interleave_16sc_a16.h \ + volk_32f_32f_interleave_32fc_a16.h \ + volk_32f_32f_max_32f_a16.h \ + volk_32f_32f_min_32f_a16.h \ + volk_32f_32f_multiply_32f_a16.h \ + volk_32f_s32f_normalize_a16.h \ + volk_32f_s32f_power_32f_a16.h \ + volk_32f_sqrt_32f_a16.h \ + volk_32f_s32f_stddev_32f_a16.h \ + volk_32f_stddev_and_mean_32f_32f_a16.h \ + volk_32f_32f_subtract_32f_a16.h \ + volk_32f_32f_32f_sum_of_poly_32f_a16.h \ + volk_32s_32s_and_32s_a16.h \ + volk_32s_s32f_convert_32f_a16.h \ + volk_32s_s32f_convert_32f_ua16.h \ + volk_32s_32s_or_32s_a16.h \ + volk_32u_byteswap_a16.h \ + volk_32u_popcnt_a16.h \ + volk_64f_convert_32f_a16.h \ + volk_64f_convert_32f_ua16.h \ + volk_64f_64f_max_64f_a16.h \ + volk_64f_64f_min_64f_a16.h \ + volk_64u_byteswap_a16.h \ + volk_64u_popcnt_a16.h \ + volk_8sc_deinterleave_16s_16s_a16.h \ + volk_8sc_s32f_deinterleave_32f_32f_a16.h \ + volk_8sc_deinterleave_real_16s_a16.h \ + volk_8sc_s32f_deinterleave_real_32f_a16.h \ + volk_8sc_deinterleave_real_8s_a16.h \ + volk_8sc_8sc_multiply_conjugate_16sc_a16.h \ + volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h \ + volk_8s_convert_16s_a16.h \ + volk_8s_convert_16s_ua16.h \ + volk_8s_s32f_convert_32f_a16.h \ + volk_8s_s32f_convert_32f_ua16.h VOLK_MKTABLES_SOURCES = \ $(top_srcdir)/lib/volk_rank_archs.c \ diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index f2432d7a4..f708ba7d0 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -24,8 +24,7 @@ def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : tempstring = tempstring + " }\n" tempstring = tempstring + " return 0;\n" tempstring = tempstring + "}\n" - - + for i in range(len(funclist)): tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n"; diff --git a/volk/include/volk/volk_16s_add_quad_a16.h b/volk/include/volk/volk_16s_add_quad_a16.h new file mode 100644 index 000000000..67d0c55a3 --- /dev/null +++ b/volk/include/volk/volk_16s_add_quad_a16.h @@ -0,0 +1,136 @@ +#ifndef INCLUDED_volk_16s_add_quad_a16_H +#define INCLUDED_volk_16s_add_quad_a16_H + + +#include +#include + + + + + +#if LV_HAVE_SSE2 +#include +#include + +static inline void volk_16s_add_quad_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; + p_target0 = (__m128i*)target0; + p_target1 = (__m128i*)target1; + p_target2 = (__m128i*)target2; + p_target3 = (__m128i*)target3; + + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + p_src4 = (__m128i*)src4; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + for(; i < bound; ++i) { + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(p_src1); + xmm2 = _mm_load_si128(p_src2); + xmm3 = _mm_load_si128(p_src3); + xmm4 = _mm_load_si128(p_src4); + + p_src0 += 1; + p_src1 += 1; + + xmm1 = _mm_add_epi16(xmm0, xmm1); + xmm2 = _mm_add_epi16(xmm0, xmm2); + xmm3 = _mm_add_epi16(xmm0, xmm3); + xmm4 = _mm_add_epi16(xmm0, xmm4); + + + p_src2 += 1; + p_src3 += 1; + p_src4 += 1; + + _mm_store_si128(p_target0, xmm1); + _mm_store_si128(p_target1, xmm2); + _mm_store_si128(p_target2, xmm3); + _mm_store_si128(p_target3, xmm4); + + p_target0 += 1; + p_target1 += 1; + p_target2 += 1; + p_target3 += 1; + } + /*asm volatile + ( + ".%=volk_16s_add_quad_a16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je .%=volk_16s_add_quad_a16_sse2_END\n\t" + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + "movaps (%[src4]), %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $16, %[src1]\n\t" + "add $16, %[src2]\n\t" + "add $16, %[src3]\n\t" + "add $16, %[src4]\n\t" + "paddw %%xmm1, %%xmm2\n\t" + "paddw %%xmm1, %%xmm3\n\t" + "paddw %%xmm1, %%xmm4\n\t" + "paddw %%xmm1, %%xmm5\n\t" + "add $-1, %[bound]\n\t" + "movaps %%xmm2, (%[target0])\n\t" + "movaps %%xmm3, (%[target1])\n\t" + "movaps %%xmm4, (%[target2])\n\t" + "movaps %%xmm5, (%[target3])\n\t" + "add $16, %[target0]\n\t" + "add $16, %[target1]\n\t" + "add $16, %[target2]\n\t" + "add $16, %[target3]\n\t" + "jmp .%=volk_16s_add_quad_a16_sse2_L1\n\t" + ".%=volk_16s_add_quad_a16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) + :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + + */ + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} +#endif /*LV_HAVE_SSE2*/ + + +#if LV_HAVE_GENERIC + +static inline void volk_16s_add_quad_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + + + + + +#endif /*INCLUDED_volk_16s_add_quad_a16_H*/ diff --git a/volk/include/volk/volk_16s_add_quad_aligned16.h b/volk/include/volk/volk_16s_add_quad_aligned16.h deleted file mode 100644 index 63042bef1..000000000 --- a/volk/include/volk/volk_16s_add_quad_aligned16.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H -#define INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H - - -#include -#include - - - - - -#if LV_HAVE_SSE2 -#include -#include - -static inline void volk_16s_add_quad_aligned16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; - p_target0 = (__m128i*)target0; - p_target1 = (__m128i*)target1; - p_target2 = (__m128i*)target2; - p_target3 = (__m128i*)target3; - - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - p_src4 = (__m128i*)src4; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - for(; i < bound; ++i) { - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(p_src1); - xmm2 = _mm_load_si128(p_src2); - xmm3 = _mm_load_si128(p_src3); - xmm4 = _mm_load_si128(p_src4); - - p_src0 += 1; - p_src1 += 1; - - xmm1 = _mm_add_epi16(xmm0, xmm1); - xmm2 = _mm_add_epi16(xmm0, xmm2); - xmm3 = _mm_add_epi16(xmm0, xmm3); - xmm4 = _mm_add_epi16(xmm0, xmm4); - - - p_src2 += 1; - p_src3 += 1; - p_src4 += 1; - - _mm_store_si128(p_target0, xmm1); - _mm_store_si128(p_target1, xmm2); - _mm_store_si128(p_target2, xmm3); - _mm_store_si128(p_target3, xmm4); - - p_target0 += 1; - p_target1 += 1; - p_target2 += 1; - p_target3 += 1; - } - /*asm volatile - ( - ".%=volk_16s_add_quad_aligned16_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je .%=volk_16s_add_quad_aligned16_sse2_END\n\t" - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - "movaps (%[src4]), %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $16, %[src1]\n\t" - "add $16, %[src2]\n\t" - "add $16, %[src3]\n\t" - "add $16, %[src4]\n\t" - "paddw %%xmm1, %%xmm2\n\t" - "paddw %%xmm1, %%xmm3\n\t" - "paddw %%xmm1, %%xmm4\n\t" - "paddw %%xmm1, %%xmm5\n\t" - "add $-1, %[bound]\n\t" - "movaps %%xmm2, (%[target0])\n\t" - "movaps %%xmm3, (%[target1])\n\t" - "movaps %%xmm4, (%[target2])\n\t" - "movaps %%xmm5, (%[target3])\n\t" - "add $16, %[target0]\n\t" - "add $16, %[target1]\n\t" - "add $16, %[target2]\n\t" - "add $16, %[target3]\n\t" - "jmp .%=volk_16s_add_quad_aligned16_sse2_L1\n\t" - ".%=volk_16s_add_quad_aligned16_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) - :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - - */ - - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } -} -#endif /*LV_HAVE_SSE2*/ - - -#if LV_HAVE_GENERIC - -static inline void volk_16s_add_quad_aligned16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - for(i = 0; i < bound; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } -} - -#endif /* LV_HAVE_GENERIC */ - - - - - -#endif /*INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_branch_4_state_8_a16.h b/volk/include/volk/volk_16s_branch_4_state_8_a16.h new file mode 100644 index 000000000..4c1af8729 --- /dev/null +++ b/volk/include/volk/volk_16s_branch_4_state_8_a16.h @@ -0,0 +1,194 @@ +#ifndef INCLUDED_volk_16s_branch_4_state_8_a16_H +#define INCLUDED_volk_16s_branch_4_state_8_a16_H + + +#include +#include + + + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16s_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + + __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; + + + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = 1; + + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + xmm0 = _mm_load_si128((__m128i*)permuters[0]); + xmm6 = _mm_load_si128((__m128i*)permuters[1]); + xmm8 = _mm_load_si128((__m128i*)permuters[2]); + xmm10 = _mm_load_si128((__m128i*)permuters[3]); + + for(; i < bound; ++i) { + + xmm5 = _mm_load_si128(p_src0); + + + + + + + + + + xmm0 = _mm_shuffle_epi8(xmm5, xmm0); + xmm6 = _mm_shuffle_epi8(xmm5, xmm6); + xmm8 = _mm_shuffle_epi8(xmm5, xmm8); + xmm10 = _mm_shuffle_epi8(xmm5, xmm10); + + p_src0 += 4; + + + xmm5 = _mm_add_epi16(xmm1, xmm2); + + xmm6 = _mm_add_epi16(xmm2, xmm6); + xmm8 = _mm_add_epi16(xmm1, xmm8); + + + xmm7 = _mm_load_si128(p_cntl2); + xmm9 = _mm_load_si128(p_cntl3); + + xmm0 = _mm_add_epi16(xmm5, xmm0); + + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm5 = _mm_load_si128(&p_cntl2[1]); + xmm11 = _mm_load_si128(&p_cntl3[1]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm7); + + + + xmm7 = _mm_load_si128(&p_cntl2[2]); + xmm9 = _mm_load_si128(&p_cntl3[2]); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm6 = _mm_add_epi16(xmm6, xmm5); + + + xmm5 = _mm_load_si128(&p_cntl2[3]); + xmm11 = _mm_load_si128(&p_cntl3[3]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm8 = _mm_add_epi16(xmm8, xmm7); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + _mm_store_si128(p_target, xmm0); + _mm_store_si128(&p_target[1], xmm6); + + xmm10 = _mm_add_epi16(xmm5, xmm10); + + _mm_store_si128(&p_target[2], xmm8); + + _mm_store_si128(&p_target[3], xmm10); + + p_target += 3; + } +} + + +#endif /*LV_HAVE_SSEs*/ + +#if LV_HAVE_GENERIC +static inline void volk_16s_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + int i = 0; + + int bound = 4; + + for(; i < bound; ++i) { + target[i* 8] = src0[((char)permuters[i][0])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8] & scalars[2]) + + (cntl3[i * 8] & scalars[3]); + target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 1] & scalars[2]) + + (cntl3[i * 8 + 1] & scalars[3]); + target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 2] & scalars[2]) + + (cntl3[i * 8 + 2] & scalars[3]); + target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 3] & scalars[2]) + + (cntl3[i * 8 + 3] & scalars[3]); + target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 4] & scalars[2]) + + (cntl3[i * 8 + 4] & scalars[3]); + target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 5] & scalars[2]) + + (cntl3[i * 8 + 5] & scalars[3]); + target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 6] & scalars[2]) + + (cntl3[i * 8 + 6] & scalars[3]); + target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 7] & scalars[2]) + + (cntl3[i * 8 + 7] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16s_branch_4_state_8_a16_H*/ diff --git a/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h b/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h deleted file mode 100644 index fb9d7cb87..000000000 --- a/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h +++ /dev/null @@ -1,194 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H -#define INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H - - -#include -#include - - - - -#if LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16s_branch_4_state_8_aligned16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; - - __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; - - - - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; - - int i = 0; - - int bound = 1; - - - xmm0 = _mm_load_si128(p_scalars); - - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - - xmm0 = _mm_load_si128((__m128i*)permuters[0]); - xmm6 = _mm_load_si128((__m128i*)permuters[1]); - xmm8 = _mm_load_si128((__m128i*)permuters[2]); - xmm10 = _mm_load_si128((__m128i*)permuters[3]); - - for(; i < bound; ++i) { - - xmm5 = _mm_load_si128(p_src0); - - - - - - - - - - xmm0 = _mm_shuffle_epi8(xmm5, xmm0); - xmm6 = _mm_shuffle_epi8(xmm5, xmm6); - xmm8 = _mm_shuffle_epi8(xmm5, xmm8); - xmm10 = _mm_shuffle_epi8(xmm5, xmm10); - - p_src0 += 4; - - - xmm5 = _mm_add_epi16(xmm1, xmm2); - - xmm6 = _mm_add_epi16(xmm2, xmm6); - xmm8 = _mm_add_epi16(xmm1, xmm8); - - - xmm7 = _mm_load_si128(p_cntl2); - xmm9 = _mm_load_si128(p_cntl3); - - xmm0 = _mm_add_epi16(xmm5, xmm0); - - - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); - - xmm5 = _mm_load_si128(&p_cntl2[1]); - xmm11 = _mm_load_si128(&p_cntl3[1]); - - xmm7 = _mm_add_epi16(xmm7, xmm9); - - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); - - xmm0 = _mm_add_epi16(xmm0, xmm7); - - - - xmm7 = _mm_load_si128(&p_cntl2[2]); - xmm9 = _mm_load_si128(&p_cntl3[2]); - - xmm5 = _mm_add_epi16(xmm5, xmm11); - - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); - - xmm6 = _mm_add_epi16(xmm6, xmm5); - - - xmm5 = _mm_load_si128(&p_cntl2[3]); - xmm11 = _mm_load_si128(&p_cntl3[3]); - - xmm7 = _mm_add_epi16(xmm7, xmm9); - - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); - - xmm8 = _mm_add_epi16(xmm8, xmm7); - - xmm5 = _mm_add_epi16(xmm5, xmm11); - - _mm_store_si128(p_target, xmm0); - _mm_store_si128(&p_target[1], xmm6); - - xmm10 = _mm_add_epi16(xmm5, xmm10); - - _mm_store_si128(&p_target[2], xmm8); - - _mm_store_si128(&p_target[3], xmm10); - - p_target += 3; - } -} - - -#endif /*LV_HAVE_SSEs*/ - -#if LV_HAVE_GENERIC -static inline void volk_16s_branch_4_state_8_aligned16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - int i = 0; - - int bound = 4; - - for(; i < bound; ++i) { - target[i* 8] = src0[((char)permuters[i][0])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8] & scalars[2]) - + (cntl3[i * 8] & scalars[3]); - target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 1] & scalars[2]) - + (cntl3[i * 8 + 1] & scalars[3]); - target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 2] & scalars[2]) - + (cntl3[i * 8 + 2] & scalars[3]); - target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 3] & scalars[2]) - + (cntl3[i * 8 + 3] & scalars[3]); - target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 4] & scalars[2]) - + (cntl3[i * 8 + 4] & scalars[3]); - target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 5] & scalars[2]) - + (cntl3[i * 8 + 5] & scalars[3]); - target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 6] & scalars[2]) - + (cntl3[i * 8 + 6] & scalars[3]); - target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 7] & scalars[2]) - + (cntl3[i * 8 + 7] & scalars[3]); - - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_convert_32f_aligned16.h b/volk/include/volk/volk_16s_convert_32f_aligned16.h deleted file mode 100644 index 126ce1528..000000000 --- a/volk/include/volk/volk_16s_convert_32f_aligned16.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H -#define INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16s_convert_32f_aligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16s_convert_32f_aligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16s_convert_32f_aligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_convert_32f_unaligned16.h b/volk/include/volk/volk_16s_convert_32f_unaligned16.h deleted file mode 100644 index d6212fba5..000000000 --- a/volk/include/volk/volk_16s_convert_32f_unaligned16.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H -#define INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_16s_convert_32f_unaligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_16s_convert_32f_unaligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_16s_convert_32f_unaligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_a16.h b/volk/include/volk/volk_16s_convert_8s_a16.h new file mode 100644 index 000000000..13db435de --- /dev/null +++ b/volk/include/volk/volk_16s_convert_8s_a16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_volk_16s_convert_8s_a16_H +#define INCLUDED_volk_16s_convert_8s_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16s_convert_8s_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16s_convert_8s_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16s_convert_8s_a16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_aligned16.h b/volk/include/volk/volk_16s_convert_8s_aligned16.h deleted file mode 100644 index 64c368688..000000000 --- a/volk/include/volk/volk_16s_convert_8s_aligned16.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H -#define INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16s_convert_8s_aligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - - // Load the 16 values - inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); - - ret = _mm_packs_epi16(inputVal1, inputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, ret); - - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16s_convert_8s_aligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_ua16.h b/volk/include/volk/volk_16s_convert_8s_ua16.h new file mode 100644 index 000000000..9941118ae --- /dev/null +++ b/volk/include/volk/volk_16s_convert_8s_ua16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_16s_convert_8s_ua16_H +#define INCLUDED_volk_16s_convert_8s_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned +*/ +static inline void volk_16s_convert_8s_ua16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned +*/ +static inline void volk_16s_convert_8s_ua16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16s_convert_8s_ua16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_unaligned16.h b/volk/include/volk/volk_16s_convert_8s_unaligned16.h deleted file mode 100644 index ca925de86..000000000 --- a/volk/include/volk/volk_16s_convert_8s_unaligned16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H -#define INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned -*/ -static inline void volk_16s_convert_8s_unaligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - - // Load the 16 values - inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; - - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); - - ret = _mm_packs_epi16(inputVal1, inputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, ret); - - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned -*/ -static inline void volk_16s_convert_8s_unaligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_max_star_16s_a16.h b/volk/include/volk/volk_16s_max_star_16s_a16.h new file mode 100644 index 000000000..b2ec90552 --- /dev/null +++ b/volk/include/volk/volk_16s_max_star_16s_a16.h @@ -0,0 +1,108 @@ +#ifndef INCLUDED_volk_16s_max_star_16s_a16_H +#define INCLUDED_volk_16s_max_star_16s_a16_H + + +#include +#include + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16s_max_star_16s_a16_ssse3(short* target, short* src0, unsigned int num_bytes) { + + + + short candidate = src0[0]; + short cands[8]; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + + + __m128i *p_src0; + + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + xmm1 = _mm_setzero_si128(); + xmm0 = _mm_setzero_si128(); + //_mm_insert_epi16(xmm0, candidate, 0); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm1); + + + for(i = 0; i < bound; ++i) { + xmm1 = _mm_load_si128(p_src0); + p_src0 += 1; + xmm2 = _mm_sub_epi16(xmm1, xmm0); + + + + + + + xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); + xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); + xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); + + xmm6 = _mm_xor_si128(xmm4, xmm5); + + xmm3 = _mm_and_si128(xmm3, xmm0); + xmm4 = _mm_and_si128(xmm6, xmm1); + + xmm0 = _mm_add_epi16(xmm3, xmm4); + + + } + + _mm_store_si128((__m128i*)cands, xmm0); + + for(i = 0; i < 8; ++i) { + candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; + } + + + + for(i = 0; i < leftovers; ++i) { + + candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; + } + + target[0] = candidate; + + + + + +} + +#endif /*LV_HAVE_SSSE3*/ + +#if LV_HAVE_GENERIC + +static inline void volk_16s_max_star_16s_a16_generic(short* target, short* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short candidate = src0[0]; + for(i = 1; i < bound; ++i) { + candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; + } + target[0] = candidate; + +} + + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16s_max_star_16s_a16_H*/ diff --git a/volk/include/volk/volk_16s_max_star_aligned16.h b/volk/include/volk/volk_16s_max_star_aligned16.h deleted file mode 100644 index ba4e979ec..000000000 --- a/volk/include/volk/volk_16s_max_star_aligned16.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H -#define INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H - - -#include -#include - - -#if LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16s_max_star_aligned16_ssse3(short* target, short* src0, unsigned int num_bytes) { - - - - short candidate = src0[0]; - short cands[8]; - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; - - - __m128i *p_src0; - - p_src0 = (__m128i*)src0; - - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 1) & 7; - - int i = 0; - - - xmm1 = _mm_setzero_si128(); - xmm0 = _mm_setzero_si128(); - //_mm_insert_epi16(xmm0, candidate, 0); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm1); - - - for(i = 0; i < bound; ++i) { - xmm1 = _mm_load_si128(p_src0); - p_src0 += 1; - xmm2 = _mm_sub_epi16(xmm1, xmm0); - - - - - - - xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); - xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); - xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); - - xmm6 = _mm_xor_si128(xmm4, xmm5); - - xmm3 = _mm_and_si128(xmm3, xmm0); - xmm4 = _mm_and_si128(xmm6, xmm1); - - xmm0 = _mm_add_epi16(xmm3, xmm4); - - - } - - _mm_store_si128((__m128i*)cands, xmm0); - - for(i = 0; i < 8; ++i) { - candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; - } - - - - for(i = 0; i < leftovers; ++i) { - - candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; - } - - target[0] = candidate; - - - - - -} - -#endif /*LV_HAVE_SSSE3*/ - -#if LV_HAVE_GENERIC - -static inline void volk_16s_max_star_aligned16_generic(short* target, short* src0, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - short candidate = src0[0]; - for(i = 1; i < bound; ++i) { - candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; - } - target[0] = candidate; - -} - - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h b/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h new file mode 100644 index 000000000..68994593b --- /dev/null +++ b/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h @@ -0,0 +1,130 @@ +#ifndef INCLUDED_volk_16s_max_star_horizontal_16s_a16_H +#define INCLUDED_volk_16s_max_star_horizontal_16s_a16_H + + +#include +#include + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16s_max_star_horizontal_16s_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; + const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; + + + + volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i xmm5, xmm6, xmm7, xmm8; + + xmm4 = _mm_load_si128((__m128i*)shufmask0); + xmm5 = _mm_load_si128((__m128i*)shufmask1); + xmm6 = _mm_load_si128((__m128i*)andmask0); + xmm7 = _mm_load_si128((__m128i*)andmask1); + + __m128i *p_target, *p_src0; + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 5; + int intermediate = (num_bytes >> 4) & 1; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + for(i = 0; i < bound; ++i) { + + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(&p_src0[1]); + + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 2; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + xmm3 = _mm_and_si128(xmm2, xmm7); + + + xmm8 = _mm_add_epi8(xmm8, xmm4); + xmm3 = _mm_add_epi8(xmm3, xmm5); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm8); + xmm1 = _mm_shuffle_epi8(xmm1, xmm3); + + + xmm3 = _mm_add_epi16(xmm0, xmm1); + + + _mm_store_si128(p_target, xmm3); + + p_target += 1; + + } + + for(i = 0; i < intermediate; ++i) { + + xmm0 = _mm_load_si128(p_src0); + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 1; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + + xmm3 = _mm_add_epi8(xmm8, xmm4); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm3); + + + _mm_storel_pd((double*)p_target, (__m128d)xmm0); + + p_target = (__m128i*)((int8_t*)p_target + 8); + + } + + for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { + target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; + } + + +} + +#endif /*LV_HAVE_SSSE3*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16s_max_star_horizontal_16s_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + + for(i = 0; i < bound; i += 2) { + target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; + } + +} + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_volk_16s_max_star_horizontal_16s_a16_H*/ diff --git a/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h b/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h deleted file mode 100644 index 82d011677..000000000 --- a/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H -#define INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H - - -#include -#include - - -#if LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16s_max_star_horizontal_aligned16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { - - const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; - const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; - const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; - - - - volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i xmm5, xmm6, xmm7, xmm8; - - xmm4 = _mm_load_si128((__m128i*)shufmask0); - xmm5 = _mm_load_si128((__m128i*)shufmask1); - xmm6 = _mm_load_si128((__m128i*)andmask0); - xmm7 = _mm_load_si128((__m128i*)andmask1); - - __m128i *p_target, *p_src0; - - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - - int bound = num_bytes >> 5; - int intermediate = (num_bytes >> 4) & 1; - int leftovers = (num_bytes >> 1) & 7; - - int i = 0; - - - for(i = 0; i < bound; ++i) { - - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(&p_src0[1]); - - - - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 2; - - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - - xmm8 = _mm_and_si128(xmm2, xmm6); - xmm3 = _mm_and_si128(xmm2, xmm7); - - - xmm8 = _mm_add_epi8(xmm8, xmm4); - xmm3 = _mm_add_epi8(xmm3, xmm5); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm8); - xmm1 = _mm_shuffle_epi8(xmm1, xmm3); - - - xmm3 = _mm_add_epi16(xmm0, xmm1); - - - _mm_store_si128(p_target, xmm3); - - p_target += 1; - - } - - for(i = 0; i < intermediate; ++i) { - - xmm0 = _mm_load_si128(p_src0); - - - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 1; - - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - - xmm8 = _mm_and_si128(xmm2, xmm6); - - xmm3 = _mm_add_epi8(xmm8, xmm4); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm3); - - - _mm_storel_pd((double*)p_target, (__m128d)xmm0); - - p_target = (__m128i*)((int8_t*)p_target + 8); - - } - - for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { - target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; - } - - -} - -#endif /*LV_HAVE_SSSE3*/ - - -#if LV_HAVE_GENERIC -static inline void volk_16s_max_star_horizontal_aligned16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - - for(i = 0; i < bound; i += 2) { - target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; - } - -} - - - -#endif /*LV_HAVE_GENERIC*/ - -#endif /*INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h new file mode 100644 index 000000000..2e7586b57 --- /dev/null +++ b/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h @@ -0,0 +1,139 @@ +#ifndef INCLUDED_volk_16s_permute_and_scalar_add_a16_H +#define INCLUDED_volk_16s_permute_and_scalar_add_a16_H + + +#include +#include + + + + +#if LV_HAVE_SSE2 + +#include +#include + +static inline void volk_16s_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; + + short* p_permute_indexes = permute_indexes; + + p_target = (__m128i*)target; + p_cntl0 = (__m128i*)cntl0; + p_cntl1 = (__m128i*)cntl1; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + + for(; i < bound; ++i) { + xmm0 = _mm_setzero_si128(); + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = _mm_setzero_si128(); + + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_permute_indexes += 8; + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + xmm5 = _mm_load_si128(p_cntl0); + xmm6 = _mm_load_si128(p_cntl1); + xmm7 = _mm_load_si128(p_cntl2); + + xmm5 = _mm_and_si128(xmm5, xmm1); + xmm6 = _mm_and_si128(xmm6, xmm2); + xmm7 = _mm_and_si128(xmm7, xmm3); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + xmm5 = _mm_load_si128(p_cntl3); + + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_cntl0 += 1; + + xmm5 = _mm_and_si128(xmm5, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + p_cntl1 += 1; + p_cntl2 += 1; + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + p_cntl3 += 1; + + _mm_store_si128(p_target, xmm0); + + p_target += 1; + } + + + + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + } +} +#endif /*LV_HAVE_SSEs*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16s_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16s_permute_and_scalar_add_a16_H*/ diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h deleted file mode 100644 index 452d05c4f..000000000 --- a/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h +++ /dev/null @@ -1,139 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H -#define INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H - - -#include -#include - - - - -#if LV_HAVE_SSE2 - -#include -#include - -static inline void volk_16s_permute_and_scalar_add_aligned16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; - - short* p_permute_indexes = permute_indexes; - - p_target = (__m128i*)target; - p_cntl0 = (__m128i*)cntl0; - p_cntl1 = (__m128i*)cntl1; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - xmm0 = _mm_load_si128(p_scalars); - - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - - - for(; i < bound; ++i) { - xmm0 = _mm_setzero_si128(); - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = _mm_setzero_si128(); - - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); - - xmm0 = _mm_add_epi16(xmm0, xmm5); - xmm6 = _mm_add_epi16(xmm6, xmm7); - - p_permute_indexes += 8; - - xmm0 = _mm_add_epi16(xmm0, xmm6); - - xmm5 = _mm_load_si128(p_cntl0); - xmm6 = _mm_load_si128(p_cntl1); - xmm7 = _mm_load_si128(p_cntl2); - - xmm5 = _mm_and_si128(xmm5, xmm1); - xmm6 = _mm_and_si128(xmm6, xmm2); - xmm7 = _mm_and_si128(xmm7, xmm3); - - xmm0 = _mm_add_epi16(xmm0, xmm5); - - xmm5 = _mm_load_si128(p_cntl3); - - xmm6 = _mm_add_epi16(xmm6, xmm7); - - p_cntl0 += 1; - - xmm5 = _mm_and_si128(xmm5, xmm4); - - xmm0 = _mm_add_epi16(xmm0, xmm6); - - p_cntl1 += 1; - p_cntl2 += 1; - - xmm0 = _mm_add_epi16(xmm0, xmm5); - - p_cntl3 += 1; - - _mm_store_si128(p_target, xmm0); - - p_target += 1; - } - - - - - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - } -} -#endif /*LV_HAVE_SSEs*/ - - -#if LV_HAVE_GENERIC -static inline void volk_16s_permute_and_scalar_add_aligned16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - for(i = 0; i < bound; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_quad_max_star_16s_a16.h b/volk/include/volk/volk_16s_quad_max_star_16s_a16.h new file mode 100644 index 000000000..3e89ff963 --- /dev/null +++ b/volk/include/volk/volk_16s_quad_max_star_16s_a16.h @@ -0,0 +1,191 @@ +#ifndef INCLUDED_volk_16s_quad_max_star_16s_a16_H +#define INCLUDED_volk_16s_quad_max_star_16s_a16_H + + +#include +#include + + + + + +#if LV_HAVE_SSE2 + +#include + +static inline void volk_16s_quad_max_star_16s_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + + + + int i = 0; + + int bound = (num_bytes >> 4); + int bound_copy = bound; + int leftovers = (num_bytes >> 1) & 7; + + __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; + p_target = (__m128i*) target; + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + + + + __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + while(bound_copy > 0) { + + xmm1 = _mm_load_si128(p_src0); + xmm2 = _mm_load_si128(p_src1); + xmm3 = _mm_load_si128(p_src2); + xmm4 = _mm_load_si128(p_src3); + + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = xmm1; + xmm8 = xmm3; + + + xmm1 = _mm_sub_epi16(xmm2, xmm1); + + + + xmm3 = _mm_sub_epi16(xmm4, xmm3); + + xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); + xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); + + + + xmm2 = _mm_and_si128(xmm5, xmm2); + xmm4 = _mm_and_si128(xmm6, xmm4); + xmm5 = _mm_andnot_si128(xmm5, xmm7); + xmm6 = _mm_andnot_si128(xmm6, xmm8); + + xmm5 = _mm_add_epi16(xmm2, xmm5); + xmm6 = _mm_add_epi16(xmm4, xmm6); + + + xmm1 = _mm_xor_si128(xmm1, xmm1); + xmm2 = xmm5; + xmm5 = _mm_sub_epi16(xmm6, xmm5); + p_src0 += 1; + bound_copy -= 1; + + xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); + p_src1 += 1; + + xmm6 = _mm_and_si128(xmm1, xmm6); + + xmm1 = _mm_andnot_si128(xmm1, xmm2); + p_src2 += 1; + + + + xmm1 = _mm_add_epi16(xmm6, xmm1); + p_src3 += 1; + + + _mm_store_si128(p_target, xmm1); + p_target += 1; + + } + + + /*asm volatile + ( + "volk_16s_quad_max_star_16s_a16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je volk_16s_quad_max_star_16s_a16_sse2_END\n\t" + + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "movaps %%xmm1, %%xmm7\n\t" + "movaps %%xmm3, %%xmm8\n\t" + "psubw %%xmm2, %%xmm1\n\t" + "psubw %%xmm4, %%xmm3\n\t" + + "pcmpgtw %%xmm1, %%xmm5\n\t" + "pcmpgtw %%xmm3, %%xmm6\n\t" + + "pand %%xmm5, %%xmm2\n\t" + "pand %%xmm6, %%xmm4\n\t" + "pandn %%xmm7, %%xmm5\n\t" + "pandn %%xmm8, %%xmm6\n\t" + + "paddw %%xmm2, %%xmm5\n\t" + "paddw %%xmm4, %%xmm6\n\t" + + "pxor %%xmm1, %%xmm1\n\t" + "movaps %%xmm5, %%xmm2\n\t" + + "psubw %%xmm6, %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $-1, %[bound]\n\t" + + "pcmpgtw %%xmm5, %%xmm1\n\t" + "add $16, %[src1]\n\t" + + "pand %%xmm1, %%xmm6\n\t" + + "pandn %%xmm2, %%xmm1\n\t" + "add $16, %[src2]\n\t" + + "paddw %%xmm6, %%xmm1\n\t" + "add $16, %[src3]\n\t" + + "movaps %%xmm1, (%[target])\n\t" + "addw $16, %[target]\n\t" + "jmp volk_16s_quad_max_star_16s_a16_sse2_L1\n\t" + + "volk_16s_quad_max_star_16s_a16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) + : + ); + */ + + short temp0 = 0; + short temp1 = 0; + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } + return; + + +} + +#endif /*LV_HAVE_SSE2*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16s_quad_max_star_16s_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short temp0 = 0; + short temp1 = 0; + for(i = 0; i < bound; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } +} + + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_volk_16s_quad_max_star_16s_a16_H*/ diff --git a/volk/include/volk/volk_16s_quad_max_star_aligned16.h b/volk/include/volk/volk_16s_quad_max_star_aligned16.h deleted file mode 100644 index 1004c4d23..000000000 --- a/volk/include/volk/volk_16s_quad_max_star_aligned16.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H -#define INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H - - -#include -#include - - - - - -#if LV_HAVE_SSE2 - -#include - -static inline void volk_16s_quad_max_star_aligned16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - - - - int i = 0; - - int bound = (num_bytes >> 4); - int bound_copy = bound; - int leftovers = (num_bytes >> 1) & 7; - - __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; - p_target = (__m128i*) target; - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - - - - __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - - while(bound_copy > 0) { - - xmm1 = _mm_load_si128(p_src0); - xmm2 = _mm_load_si128(p_src1); - xmm3 = _mm_load_si128(p_src2); - xmm4 = _mm_load_si128(p_src3); - - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = xmm1; - xmm8 = xmm3; - - - xmm1 = _mm_sub_epi16(xmm2, xmm1); - - - - xmm3 = _mm_sub_epi16(xmm4, xmm3); - - xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); - xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); - - - - xmm2 = _mm_and_si128(xmm5, xmm2); - xmm4 = _mm_and_si128(xmm6, xmm4); - xmm5 = _mm_andnot_si128(xmm5, xmm7); - xmm6 = _mm_andnot_si128(xmm6, xmm8); - - xmm5 = _mm_add_epi16(xmm2, xmm5); - xmm6 = _mm_add_epi16(xmm4, xmm6); - - - xmm1 = _mm_xor_si128(xmm1, xmm1); - xmm2 = xmm5; - xmm5 = _mm_sub_epi16(xmm6, xmm5); - p_src0 += 1; - bound_copy -= 1; - - xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); - p_src1 += 1; - - xmm6 = _mm_and_si128(xmm1, xmm6); - - xmm1 = _mm_andnot_si128(xmm1, xmm2); - p_src2 += 1; - - - - xmm1 = _mm_add_epi16(xmm6, xmm1); - p_src3 += 1; - - - _mm_store_si128(p_target, xmm1); - p_target += 1; - - } - - - /*asm volatile - ( - "volk_16s_quad_max_star_aligned16_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je volk_16s_quad_max_star_aligned16_sse2_END\n\t" - - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "movaps %%xmm1, %%xmm7\n\t" - "movaps %%xmm3, %%xmm8\n\t" - "psubw %%xmm2, %%xmm1\n\t" - "psubw %%xmm4, %%xmm3\n\t" - - "pcmpgtw %%xmm1, %%xmm5\n\t" - "pcmpgtw %%xmm3, %%xmm6\n\t" - - "pand %%xmm5, %%xmm2\n\t" - "pand %%xmm6, %%xmm4\n\t" - "pandn %%xmm7, %%xmm5\n\t" - "pandn %%xmm8, %%xmm6\n\t" - - "paddw %%xmm2, %%xmm5\n\t" - "paddw %%xmm4, %%xmm6\n\t" - - "pxor %%xmm1, %%xmm1\n\t" - "movaps %%xmm5, %%xmm2\n\t" - - "psubw %%xmm6, %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $-1, %[bound]\n\t" - - "pcmpgtw %%xmm5, %%xmm1\n\t" - "add $16, %[src1]\n\t" - - "pand %%xmm1, %%xmm6\n\t" - - "pandn %%xmm2, %%xmm1\n\t" - "add $16, %[src2]\n\t" - - "paddw %%xmm6, %%xmm1\n\t" - "add $16, %[src3]\n\t" - - "movaps %%xmm1, (%[target])\n\t" - "addw $16, %[target]\n\t" - "jmp volk_16s_quad_max_star_aligned16_sse2_L1\n\t" - - "volk_16s_quad_max_star_aligned16_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) - : - ); - */ - - short temp0 = 0; - short temp1 = 0; - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } - return; - - -} - -#endif /*LV_HAVE_SSE2*/ - - -#if LV_HAVE_GENERIC -static inline void volk_16s_quad_max_star_aligned16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - short temp0 = 0; - short temp1 = 0; - for(i = 0; i < bound; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } -} - - - - -#endif /*LV_HAVE_GENERIC*/ - -#endif /*INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_a16.h b/volk/include/volk/volk_16s_s32f_convert_32f_a16.h new file mode 100644 index 000000000..8f9b44478 --- /dev/null +++ b/volk/include/volk/volk_16s_s32f_convert_32f_a16.h @@ -0,0 +1,119 @@ +#ifndef INCLUDED_volk_16s_s32f_convert_32f_a16_H +#define INCLUDED_volk_16s_s32f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16s_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16s_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16s_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h new file mode 100644 index 000000000..ad52aea1a --- /dev/null +++ b/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_volk_16s_s32f_convert_32f_ua16_H +#define INCLUDED_volk_16s_s32f_convert_32f_ua16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16s_s32f_convert_32f_ua16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16s_s32f_convert_32f_ua16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16s_s32f_convert_32f_ua16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h new file mode 100644 index 000000000..8e5da24ec --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H +#define INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_16s_16s_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); + __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); + qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = *int16ComplexVectorPtr++; + *qBufferPtr++ = *int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_16s_16s_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); + + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16sc_deinterleave_16s_16s_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16sc_deinterleave_16s_16s_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16sc_deinterleave_16s_16s_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h deleted file mode 100644 index cf94a3f38..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H -#define INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_16s_aligned16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); - __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); - qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = *int16ComplexVectorPtr++; - *qBufferPtr++ = *int16ComplexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_16s_aligned16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); - - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -extern void volk_16sc_deinterleave_16s_aligned16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16sc_deinterleave_16s_aligned16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_deinterleave_16s_aligned16_orc_impl(iBuffer, qBuffer, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h deleted file mode 100644 index 50b8b62d5..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H -#define INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -static inline void volk_16sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - uint64_t number = 0; - const uint64_t quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - - float floatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); - - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -static inline void volk_16sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -extern void volk_16sc_deinterleave_32f_aligned16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16sc_deinterleave_32f_aligned16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16sc_deinterleave_32f_aligned16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h new file mode 100644 index 000000000..068c1350c --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_16sc_deinterleave_real_16s_a16_H +#define INCLUDED_volk_16sc_deinterleave_real_16s_a16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_16s_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + iOutputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_16s_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i complexVal1, complexVal2, iOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16sc_deinterleave_real_16s_a16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h deleted file mode 100644 index b594c85b8..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H -#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_16s_aligned16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - - iOutputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_16s_aligned16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i complexVal1, complexVal2, iOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); - - iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h deleted file mode 100644 index 3e7be1e64..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H -#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - - for(;number < quarterPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm_cvtepi16_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; - sixteenTComplexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; - - const float iScalar = 1.0/scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; - - iValue = _mm_load_ps(floatBuffer); - - iValue = _mm_mul_ps(iValue, invScalar); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h new file mode 100644 index 000000000..afa21ebc4 --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_volk_16sc_deinterleave_real_8s_a16_H +#define INCLUDED_volk_16sc_deinterleave_real_8s_a16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + complexVal1 = _mm_or_si128(complexVal1, complexVal2); + + complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); + complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); + + complexVal3 = _mm_or_si128(complexVal3, complexVal4); + + + complexVal1 = _mm_srai_epi16(complexVal1, 8); + complexVal3 = _mm_srai_epi16(complexVal3, 8); + + iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); + int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16sc_deinterleave_real_8s_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16sc_deinterleave_real_8s_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16sc_deinterleave_real_8s_a16_orc_impl(iBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16sc_deinterleave_real_8s_a16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h deleted file mode 100644 index 2dd85a422..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H -#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - - complexVal1 = _mm_or_si128(complexVal1, complexVal2); - - complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); - complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); - - complexVal3 = _mm_or_si128(complexVal3, complexVal4); - - - complexVal1 = _mm_srai_epi16(complexVal1, 8); - complexVal3 = _mm_srai_epi16(complexVal3, 8); - - iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); - int16ComplexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -extern void volk_16sc_deinterleave_real_8s_aligned16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16sc_deinterleave_real_8s_aligned16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_deinterleave_real_8s_aligned16_orc_impl(iBuffer, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_16s_a16.h b/volk/include/volk/volk_16sc_magnitude_16s_a16.h new file mode 100644 index 000000000..d832de5fe --- /dev/null +++ b/volk/include/volk/volk_16sc_magnitude_16s_a16.h @@ -0,0 +1,190 @@ +#ifndef INCLUDED_volk_16sc_magnitude_16s_a16_H +#define INCLUDED_volk_16sc_magnitude_16s_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + float inputFloatBuffer[4] __attribute__((aligned(128))); + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue1 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue2 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float scalar = 32768.0; + for(number = 0; number < num_points; number++){ + float real = ((float)(*complexVectorPtr++)) / scalar; + float imag = ((float)(*complexVectorPtr++)) / scalar; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC_DISABLED +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16sc_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); +static inline void volk_16sc_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16sc_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16sc_magnitude_16s_a16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h deleted file mode 100644 index 41e8751d6..000000000 --- a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h +++ /dev/null @@ -1,190 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H -#define INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(32768.0); - __m128 invScalar = _mm_set_ps1(1.0/32768.0); - - __m128 cplxValue1, cplxValue2, result; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - result = _mm_mul_ps(result, vScalar); // Scale the results - - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / 32768.0; - const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; - *magnitudeVectorPtr++ = (int16_t)(val1Result); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(32768.0); - __m128 invScalar = _mm_set_ps1(1.0/32768.0); - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - - float inputFloatBuffer[4] __attribute__((aligned(128))); - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - cplxValue1 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - cplxValue2 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - result = _mm_mul_ps(result, vScalar); // Scale the results - - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / 32768.0; - const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; - *magnitudeVectorPtr++ = (int16_t)(val1Result); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float scalar = 32768.0; - for(number = 0; number < num_points; number++){ - float real = ((float)(*complexVectorPtr++)) / scalar; - float imag = ((float)(*complexVectorPtr++)) / scalar; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC_DISABLED -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); -static inline void volk_16sc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h deleted file mode 100644 index c2605d551..000000000 --- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h +++ /dev/null @@ -1,179 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H -#define INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - __m128 cplxValue1, cplxValue2, result; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) / scalar; - float val1Imag = (float)(*complexVectorPtr++) / scalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - - __m128 cplxValue1, cplxValue2, result, re, im; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); - im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(re, invScalar); - cplxValue2 = _mm_mul_ps(im, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) * iScalar; - float val1Imag = (float)(*complexVectorPtr++) * iScalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} - - -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - float real = ( (float) (*complexVectorPtr++)) * invScalar; - float imag = ( (float) (*complexVectorPtr++)) * invScalar; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC_DISABLED -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_16sc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16sc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16sc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h new file mode 100644 index 000000000..53e4253c4 --- /dev/null +++ b/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h @@ -0,0 +1,108 @@ +#ifndef INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H +#define INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + uint64_t number = 0; + const uint64_t quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +extern void volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h new file mode 100644 index 000000000..7320db368 --- /dev/null +++ b/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h @@ -0,0 +1,125 @@ +#ifndef INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H +#define INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + + for(;number < quarterPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi16_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; + sixteenTComplexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar = 1.0/scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h new file mode 100644 index 000000000..649b5cc96 --- /dev/null +++ b/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h @@ -0,0 +1,179 @@ +#ifndef INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H +#define INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) / scalar; + float val1Imag = (float)(*complexVectorPtr++) / scalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + + __m128 cplxValue1, cplxValue2, result, re, im; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); + im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(re, invScalar); + cplxValue2 = _mm_mul_ps(im, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) * iScalar; + float val1Imag = (float)(*complexVectorPtr++) * iScalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} + + +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + float real = ( (float) (*complexVectorPtr++)) * invScalar; + float imag = ( (float) (*complexVectorPtr++)) * invScalar; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC_DISABLED +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16sc_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16sc_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16sc_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_16u_byteswap_a16.h b/volk/include/volk/volk_16u_byteswap_a16.h new file mode 100644 index 000000000..c8128dbab --- /dev/null +++ b/volk/include/volk/volk_16u_byteswap_a16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_16u_byteswap_a16_H +#define INCLUDED_volk_16u_byteswap_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the 16t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + + // Byteswap any remaining points: + number = eighthPoints*8; + for(; number < num_points; number++){ + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for(point = 0; point < num_points; point++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +extern void volk_16u_byteswap_a16_orc_impl(uint16_t* intsToSwap, unsigned int num_points); +static inline void volk_16u_byteswap_a16_orc(uint16_t* intsToSwap, unsigned int num_points){ + volk_16u_byteswap_a16_orc_impl(intsToSwap, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h deleted file mode 100644 index 9d19d1a45..000000000 --- a/volk/include/volk/volk_16u_byteswap_aligned16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H -#define INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - -/*! - \brief Byteswaps (in-place) an aligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_16u_byteswap_aligned16_sse2(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - uint16_t* inputPtr = intsToSwap; - __m128i input, left, right, output; - - const unsigned int eighthPoints = num_points / 8; - for(;number < eighthPoints; number++){ - // Load the 16t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - // Do the two shifts - left = _mm_slli_epi16(input, 8); - right = _mm_srli_epi16(input, 8); - // Or the left and right halves together - output = _mm_or_si128(left, right); - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 8; - } - - - // Byteswap any remaining points: - number = eighthPoints*8; - for(; number < num_points; number++){ - uint16_t outputVal = *inputPtr; - outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); - *inputPtr = outputVal; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int point; - uint16_t* inputPtr = intsToSwap; - for(point = 0; point < num_points; point++){ - uint16_t output = *inputPtr; - output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); - *inputPtr = output; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Byteswaps (in-place) an aligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, unsigned int num_points); -static inline void volk_16u_byteswap_aligned16_orc(uint16_t* intsToSwap, unsigned int num_points){ - volk_16u_byteswap_aligned16_orc_impl(intsToSwap, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h new file mode 100644 index 000000000..a0f97f94e --- /dev/null +++ b/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h @@ -0,0 +1,151 @@ +#ifndef INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H +#define INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H + +#include +#include +#include + +#ifndef MAX +#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +#endif + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; + + xmm9 = _mm_setzero_ps(); + xmm1 = _mm_setzero_ps(); + + xmm0 = _mm_load1_ps(¢er_point_array[0]); + xmm6 = _mm_load1_ps(¢er_point_array[1]); + xmm7 = _mm_load1_ps(¢er_point_array[2]); + xmm8 = _mm_load1_ps(¢er_point_array[3]); + //xmm11 = _mm_load1_ps(¢er_point_array[4]); + xmm10 = _mm_load1_ps(cutoff); + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 2) & 3; + int i = 0; + + for(; i < bound; ++i) { + xmm2 = _mm_load_ps(src0); + xmm2 = _mm_max_ps(xmm10, xmm2); + xmm3 = _mm_mul_ps(xmm2, xmm2); + xmm4 = _mm_mul_ps(xmm2, xmm3); + xmm5 = _mm_mul_ps(xmm3, xmm3); + //xmm12 = _mm_mul_ps(xmm3, xmm4); + + xmm2 = _mm_mul_ps(xmm2, xmm0); + xmm3 = _mm_mul_ps(xmm3, xmm6); + xmm4 = _mm_mul_ps(xmm4, xmm7); + xmm5 = _mm_mul_ps(xmm5, xmm8); + //xmm12 = _mm_mul_ps(xmm12, xmm11); + + xmm2 = _mm_add_ps(xmm2, xmm3); + xmm3 = _mm_add_ps(xmm4, xmm5); + + src0 += 4; + + xmm9 = _mm_add_ps(xmm2, xmm9); + + xmm1 = _mm_add_ps(xmm3, xmm1); + + //xmm9 = _mm_add_ps(xmm12, xmm9); + } + + xmm2 = _mm_hadd_ps(xmm9, xmm1); + xmm3 = _mm_hadd_ps(xmm2, xmm2); + xmm4 = _mm_hadd_ps(xmm3, xmm3); + + _mm_store_ss(&result, xmm4); + + + + for(i = 0; i < leftovers; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth);// + + //center_point_array[4] * fith); + } + + result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; + + target[0] = result; +} + + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC + +static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + int i = 0; + + for(; i < num_bytes >> 2; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth); //+ + //center_point_array[4] * fith); + /*printf("%f12...%d\n", (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth) + + //center_point_array[4] * fith) + + (center_point_array[4]), i); + */ + } + + result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); + + + + *target = result; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_32f_add_32f_a16.h b/volk/include/volk/volk_32f_32f_add_32f_a16.h new file mode 100644 index 000000000..ba38c310f --- /dev/null +++ b/volk/include/volk/volk_32f_32f_add_32f_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_32f_add_32f_a16_H +#define INCLUDED_volk_32f_32f_add_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_32f_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_32f_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +extern void volk_32f_32f_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_32f_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_32f_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_32f_add_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_divide_32f_a16.h b/volk/include/volk/volk_32f_32f_divide_32f_a16.h new file mode 100644 index 000000000..a0995e631 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_divide_32f_a16.h @@ -0,0 +1,82 @@ +#ifndef INCLUDED_volk_32f_32f_divide_32f_a16_H +#define INCLUDED_volk_32f_32f_divide_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_32f_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_div_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_32f_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +extern void volk_32f_32f_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_32f_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_32f_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_volk_32f_32f_divide_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h new file mode 100644 index 000000000..63f5221d3 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h @@ -0,0 +1,184 @@ +#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_a16_H +#define INCLUDED_volk_32f_32f_dot_prod_32f_a16_H + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32f_32f_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE + + +static inline void volk_32f_32f_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32f_32f_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_32f_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + + bVal1 = _mm_load_ps(bPtr); bPtr += 4; + bVal2 = _mm_load_ps(bPtr); bPtr += 4; + bVal3 = _mm_load_ps(bPtr); bPtr += 4; + bVal4 = _mm_load_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + float dotProductVector[4] __attribute__((aligned(16))); + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h new file mode 100644 index 000000000..b5fa7d7a4 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h @@ -0,0 +1,184 @@ +#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H +#define INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32f_32f_dot_prod_32f_ua16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE + + +static inline void volk_32f_32f_dot_prod_32f_ua16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32f_32f_dot_prod_32f_ua16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_32f_dot_prod_32f_ua16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; + + bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + float dotProductVector[4] __attribute__((aligned(16))); + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H*/ diff --git a/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h b/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h new file mode 100644 index 000000000..34ea93349 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32f_32f_interleave_32fc_a16_H +#define INCLUDED_volk_32f_32f_interleave_32fc_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Interleaves the I & Q vector data into the complex vector + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_32f_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + unsigned int number = 0; + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + const uint64_t quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Interleaves the I & Q vector data into the complex vector. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_32f_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_32f_interleave_32fc_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_max_32f_a16.h b/volk/include/volk/volk_32f_32f_max_32f_a16.h new file mode 100644 index 000000000..8ca7a5ba8 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_max_32f_a16.h @@ -0,0 +1,85 @@ +#ifndef INCLUDED_volk_32f_32f_max_32f_a16_H +#define INCLUDED_volk_32f_32f_max_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_32f_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_max_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_32f_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_32f_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_32f_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_32f_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_32f_max_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_min_32f_a16.h b/volk/include/volk/volk_32f_32f_min_32f_a16.h new file mode 100644 index 000000000..dd05988be --- /dev/null +++ b/volk/include/volk/volk_32f_32f_min_32f_a16.h @@ -0,0 +1,85 @@ +#ifndef INCLUDED_volk_32f_32f_min_32f_a16_H +#define INCLUDED_volk_32f_32f_min_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_32f_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_min_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_32f_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_32f_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_32f_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_32f_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_32f_min_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_multiply_32f_a16.h b/volk/include/volk/volk_32f_32f_multiply_32f_a16.h new file mode 100644 index 000000000..2d004db10 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_multiply_32f_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_32f_multiply_32f_a16_H +#define INCLUDED_volk_32f_32f_multiply_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_32f_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_32f_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +extern void volk_32f_32f_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_32f_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_32f_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_32f_multiply_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h b/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h new file mode 100644 index 000000000..207382a19 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h @@ -0,0 +1,155 @@ +#ifndef INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H +#define INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue1, cplxValue2; + __m128i intValue1, intValue2; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); + + intValue1 = _mm_cvtps_epi32(cplxValue1); + intValue2 = _mm_cvtps_epi32(cplxValue2); + + intValue1 = _mm_packs_epi32(intValue1, intValue2); + + _mm_store_si128((__m128i*)complexVectorPtr, intValue1); + complexVectorPtr += 8; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_32f_s32f_interleave_16sc_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + int16_t* complexVectorPtr = (int16_t*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_subtract_32f_a16.h b/volk/include/volk/volk_32f_32f_subtract_32f_a16.h new file mode 100644 index 000000000..9fea6aa27 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_subtract_32f_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_32f_subtract_32f_a16_H +#define INCLUDED_volk_32f_32f_subtract_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_32f_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_sub_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_32f_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +extern void volk_32f_32f_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_32f_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_32f_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_32f_subtract_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_accumulator_aligned16.h b/volk/include/volk/volk_32f_accumulator_aligned16.h deleted file mode 100644 index 7e395cf50..000000000 --- a/volk/include/volk/volk_32f_accumulator_aligned16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H -#define INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Accumulates the values in the input buffer - \param result The accumulated result - \param inputBuffer The buffer of data to be accumulated - \param num_points The number of values in inputBuffer to be accumulated -*/ -static inline void volk_32f_accumulator_aligned16_sse(float* result, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - float tempBuffer[4] __attribute__((aligned(128))); - - __m128 accumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - accumulator = _mm_add_ps(accumulator, aVal); - aPtr += 4; - } - _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container - returnValue = tempBuffer[0]; - returnValue += tempBuffer[1]; - returnValue += tempBuffer[2]; - returnValue += tempBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Accumulates the values in the input buffer - \param result The accumulated result - \param inputBuffer The buffer of data to be accumulated - \param num_points The number of values in inputBuffer to be accumulated -*/ -static inline void volk_32f_accumulator_aligned16_generic(float* result, const float* inputBuffer, unsigned int num_points){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - float returnValue = 0; - - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a16.h b/volk/include/volk/volk_32f_accumulator_s32f_a16.h new file mode 100644 index 000000000..4a3588e6d --- /dev/null +++ b/volk/include/volk/volk_32f_accumulator_s32f_a16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H +#define INCLUDED_volk_32f_accumulator_s32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + float tempBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + aVal = _mm_load_ps(aPtr); + accumulator = _mm_add_ps(accumulator, aVal); + aPtr += 4; + } + _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_32f_accumulator_s32f_a16_generic(float* result, const float* inputBuffer, unsigned int num_points){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + float returnValue = 0; + + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_accumulator_s32f_a16_H */ diff --git a/volk/include/volk/volk_32f_add_aligned16.h b/volk/include/volk/volk_32f_add_aligned16.h deleted file mode 100644 index e7d8de265..000000000 --- a/volk/include/volk/volk_32f_add_aligned16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_ADD_ALIGNED16_H -#define INCLUDED_VOLK_32f_ADD_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_add_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_add_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -extern void volk_32f_add_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_add_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_add_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32f_ADD_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h new file mode 100644 index 000000000..fce77cd04 --- /dev/null +++ b/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h @@ -0,0 +1,167 @@ +#ifndef INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H +#define INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_calc_spectral_noise_floor_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* dataPointsPtr = realDataPoints; + float avgPointsVector[4] __attribute__((aligned(128))); + + __m128 dataPointsVal; + __m128 avgPointsVal = _mm_setzero_ps(); + // Calculate the sum (for mean) for all points + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); + } + + _mm_store_ps(avgPointsVector, avgPointsVal); + + float sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more + const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; + + dataPointsPtr = realDataPoints; // Reset the dataPointsPtr + __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); + __m128 vOnesVector = _mm_set_ps1(1.0); + __m128 vValidBinCount = _mm_setzero_ps(); + avgPointsVal = _mm_setzero_ps(); + __m128 compareMask; + number = 0; + // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + // Identify which items do not exceed the mean amplitude + compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); + + // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude + avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); + + // Count the number of bins which do not exceed the mean amplitude + vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); + } + + // Calculate the mean from the remaining data points + _mm_store_ps(avgPointsVector, avgPointsVal); + + sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + // Calculate the number of valid bins from the remaning count + float validBinCountVector[4] __attribute__((aligned(128))); + _mm_store_ps(validBinCountVector, vValidBinCount); + + float validBinCount = 0; + validBinCount += validBinCountVector[0]; + validBinCount += validBinCountVector[1]; + validBinCount += validBinCountVector[2]; + validBinCount += validBinCountVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(realDataPoints[number] <= meanAmplitude){ + sumMean += realDataPoints[number]; + validBinCount += 1.0; + } + } + + float localNoiseFloorAmplitude = 0; + if(validBinCount > 0.0){ + localNoiseFloorAmplitude = sumMean / validBinCount; + } + else{ + localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... + } + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_calc_spectral_noise_floor_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + float sumMean = 0.0; + unsigned int number; + // find the sum (for mean), etc + for(number = 0; number < num_points; number++){ + // sum (for mean) + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more) + const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; + + // now throw out any bins higher than the mean + sumMean = 0.0; + unsigned int newNumDataPoints = num_points; + for(number = 0; number < num_points; number++){ + if (realDataPoints[number] <= meanAmplitude) + sumMean += realDataPoints[number]; + else + newNumDataPoints--; + } + + float localNoiseFloorAmplitude = 0.0; + if (newNumDataPoints == 0) // in the odd case that all + localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! + else + localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H */ diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h deleted file mode 100644 index ff917525f..000000000 --- a/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h +++ /dev/null @@ -1,167 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H -#define INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the spectral noise floor of an input power spectrum - - Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. - - \param realDataPoints The input power spectrum - \param num_points The number of data points in the input power spectrum vector - \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 - \param noiseFloorAmplitude The noise floor of the input spectrum, in dB -*/ -static inline void volk_32f_calc_spectral_noise_floor_aligned16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* dataPointsPtr = realDataPoints; - float avgPointsVector[4] __attribute__((aligned(128))); - - __m128 dataPointsVal; - __m128 avgPointsVal = _mm_setzero_ps(); - // Calculate the sum (for mean) for all points - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); - } - - _mm_store_ps(avgPointsVector, avgPointsVal); - - float sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more - const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; - - dataPointsPtr = realDataPoints; // Reset the dataPointsPtr - __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); - __m128 vOnesVector = _mm_set_ps1(1.0); - __m128 vValidBinCount = _mm_setzero_ps(); - avgPointsVal = _mm_setzero_ps(); - __m128 compareMask; - number = 0; - // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - // Identify which items do not exceed the mean amplitude - compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); - - // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude - avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); - - // Count the number of bins which do not exceed the mean amplitude - vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); - } - - // Calculate the mean from the remaining data points - _mm_store_ps(avgPointsVector, avgPointsVal); - - sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - // Calculate the number of valid bins from the remaning count - float validBinCountVector[4] __attribute__((aligned(128))); - _mm_store_ps(validBinCountVector, vValidBinCount); - - float validBinCount = 0; - validBinCount += validBinCountVector[0]; - validBinCount += validBinCountVector[1]; - validBinCount += validBinCountVector[2]; - validBinCount += validBinCountVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(realDataPoints[number] <= meanAmplitude){ - sumMean += realDataPoints[number]; - validBinCount += 1.0; - } - } - - float localNoiseFloorAmplitude = 0; - if(validBinCount > 0.0){ - localNoiseFloorAmplitude = sumMean / validBinCount; - } - else{ - localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... - } - - *noiseFloorAmplitude = localNoiseFloorAmplitude; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the spectral noise floor of an input power spectrum - - Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. - - \param realDataPoints The input power spectrum - \param num_points The number of data points in the input power spectrum vector - \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 - \param noiseFloorAmplitude The noise floor of the input spectrum, in dB -*/ -static inline void volk_32f_calc_spectral_noise_floor_aligned16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ - float sumMean = 0.0; - unsigned int number; - // find the sum (for mean), etc - for(number = 0; number < num_points; number++){ - // sum (for mean) - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more) - const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; - - // now throw out any bins higher than the mean - sumMean = 0.0; - unsigned int newNumDataPoints = num_points; - for(number = 0; number < num_points; number++){ - if (realDataPoints[number] <= meanAmplitude) - sumMean += realDataPoints[number]; - else - newNumDataPoints--; - } - - float localNoiseFloorAmplitude = 0.0; - if (newNumDataPoints == 0) // in the odd case that all - localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! - else - localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); - - *noiseFloorAmplitude = localNoiseFloorAmplitude; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_16s_aligned16.h b/volk/include/volk/volk_32f_convert_16s_aligned16.h deleted file mode 100644 index 7fbabd9c3..000000000 --- a/volk/include/volk/volk_32f_convert_16s_aligned16.h +++ /dev/null @@ -1,110 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_16s_aligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_16s_aligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_16s_aligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_16s_unaligned16.h b/volk/include/volk/volk_32f_convert_16s_unaligned16.h deleted file mode 100644 index d2bbdf13a..000000000 --- a/volk/include/volk/volk_32f_convert_16s_unaligned16.h +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_16s_unaligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_16s_unaligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_16s_unaligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_32s_aligned16.h b/volk/include/volk/volk_32f_convert_32s_aligned16.h deleted file mode 100644 index 011ef5d0e..000000000 --- a/volk/include/volk/volk_32f_convert_32s_aligned16.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_32s_aligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_32s_aligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_32s_aligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_32s_unaligned16.h b/volk/include/volk/volk_32f_convert_32s_unaligned16.h deleted file mode 100644 index a6df826c7..000000000 --- a/volk/include/volk/volk_32f_convert_32s_unaligned16.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_32s_unaligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_32s_unaligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_32s_unaligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_a16.h b/volk/include/volk/volk_32f_convert_64f_a16.h new file mode 100644 index 000000000..c303dc118 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_64f_a16.h @@ -0,0 +1,70 @@ +#ifndef INCLUDED_volk_32f_convert_64f_a16_H +#define INCLUDED_volk_32f_convert_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_a16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_a16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_convert_64f_a16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_aligned16.h b/volk/include/volk/volk_32f_convert_64f_aligned16.h deleted file mode 100644 index 91a855813..000000000 --- a/volk/include/volk/volk_32f_convert_64f_aligned16.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_32f_convert_64f_aligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; - - for(;number < quarterPoints; number++){ - inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - ret = _mm_cvtps_pd(inputVal); - - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - - inputVal = _mm_movehl_ps(inputVal, inputVal); - - ret = _mm_cvtps_pd(inputVal); - - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_32f_convert_64f_aligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_ua16.h b/volk/include/volk/volk_32f_convert_64f_ua16.h new file mode 100644 index 000000000..c8de768dc --- /dev/null +++ b/volk/include/volk/volk_32f_convert_64f_ua16.h @@ -0,0 +1,70 @@ +#ifndef INCLUDED_volk_32f_convert_64f_ua16_H +#define INCLUDED_volk_32f_convert_64f_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_ua16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_ua16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_convert_64f_ua16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_unaligned16.h b/volk/include/volk/volk_32f_convert_64f_unaligned16.h deleted file mode 100644 index 698e0d446..000000000 --- a/volk/include/volk/volk_32f_convert_64f_unaligned16.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_32f_convert_64f_unaligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; - - for(;number < quarterPoints; number++){ - inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - ret = _mm_cvtps_pd(inputVal); - - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - - inputVal = _mm_movehl_ps(inputVal, inputVal); - - ret = _mm_cvtps_pd(inputVal); - - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_32f_convert_64f_unaligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_8s_aligned16.h b/volk/include/volk/volk_32f_convert_8s_aligned16.h deleted file mode 100644 index b9487b622..000000000 --- a/volk/include/volk/volk_32f_convert_8s_aligned16.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_8s_aligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_8s_aligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_8s_aligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_8s_unaligned16.h b/volk/include/volk/volk_32f_convert_8s_unaligned16.h deleted file mode 100644 index e986dbc87..000000000 --- a/volk/include/volk/volk_32f_convert_8s_unaligned16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_8s_unaligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_8s_unaligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_8s_unaligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_divide_aligned16.h b/volk/include/volk/volk_32f_divide_aligned16.h deleted file mode 100644 index c595b5e92..000000000 --- a/volk/include/volk/volk_32f_divide_aligned16.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H -#define INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -static inline void volk_32f_divide_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_div_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -static inline void volk_32f_divide_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -extern void volk_32f_divide_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_divide_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_divide_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_dot_prod_aligned16.h b/volk/include/volk/volk_32f_dot_prod_aligned16.h deleted file mode 100644 index 3aee1136a..000000000 --- a/volk/include/volk/volk_32f_dot_prod_aligned16.h +++ /dev/null @@ -1,184 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H -#define INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H - -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32f_dot_prod_aligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE - - -static inline void volk_32f_dot_prod_aligned16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_SSE3 - -#include - -static inline void volk_32f_dot_prod_aligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_SSE4_1 - -#include - -static inline void volk_32f_dot_prod_aligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - - bVal1 = _mm_load_ps(bPtr); bPtr += 4; - bVal2 = _mm_load_ps(bPtr); bPtr += 4; - bVal3 = _mm_load_ps(bPtr); bPtr += 4; - bVal4 = _mm_load_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - float dotProductVector[4] __attribute__((aligned(16))); - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32f_dot_prod_unaligned16.h b/volk/include/volk/volk_32f_dot_prod_unaligned16.h deleted file mode 100644 index bce6aa15f..000000000 --- a/volk/include/volk/volk_32f_dot_prod_unaligned16.h +++ /dev/null @@ -1,184 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H -#define INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H - -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32f_dot_prod_unaligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE - - -static inline void volk_32f_dot_prod_unaligned16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_SSE3 - -#include - -static inline void volk_32f_dot_prod_unaligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_SSE4_1 - -#include - -static inline void volk_32f_dot_prod_unaligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; - - bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - float dotProductVector[4] __attribute__((aligned(16))); - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H*/ diff --git a/volk/include/volk/volk_32f_fm_detect_aligned16.h b/volk/include/volk/volk_32f_fm_detect_aligned16.h deleted file mode 100644 index c82239d74..000000000 --- a/volk/include/volk/volk_32f_fm_detect_aligned16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H -#define INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) - \param bound The interval that the input phase data is in, which is used to modulo the differentiation - \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. - \param num_noints The number of real values in the input vector. -*/ -static inline void volk_32f_fm_detect_aligned16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ - if (num_points < 1) { - return; - } - unsigned int number = 1; - unsigned int j = 0; - // num_points-1 keeps Fedora 7's gcc from crashing... - // num_points won't work. :( - const unsigned int quarterPoints = (num_points-1) / 4; - - float* outPtr = outputVector; - const float* inPtr = inputVector; - __m128 upperBound = _mm_set_ps1(bound); - __m128 lowerBound = _mm_set_ps1(-bound); - __m128 next3old1; - __m128 next4; - __m128 boundAdjust; - __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. - __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. - // Do the first 4 by hand since we're going in from the saveValue: - *outPtr = *inPtr - *saveValue; - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - for (; number < quarterPoints; number++) { - // Load data - next3old1 = _mm_loadu_ps((float*) (inPtr-1)); - next4 = _mm_load_ps(inPtr); - inPtr += 4; - // Subtract and store: - next3old1 = _mm_sub_ps(next4, next3old1); - // Bound: - boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); - boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); - next4 = _mm_cmplt_ps(next3old1, lowerBound); - next4 = _mm_and_ps(next4, negBoundAdjust); - boundAdjust = _mm_or_ps(next4, boundAdjust); - // Make sure we're in the bounding interval: - next3old1 = _mm_add_ps(next3old1, boundAdjust); - _mm_store_ps(outPtr,next3old1); // Store the results back into the output - outPtr += 4; - } - - for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - *saveValue = inputVector[num_points-1]; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) - \param bound The interval that the input phase data is in, which is used to modulo the differentiation - \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. - \param num_points The number of real values in the input vector. -*/ -static inline void volk_32f_fm_detect_aligned16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ - if (num_points < 1) { - return; - } - unsigned int number = 0; - float* outPtr = outputVector; - const float* inPtr = inputVector; - - // Do the first 1 by hand since we're going in from the saveValue: - *outPtr = *inPtr - *saveValue; - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - - for (number = 1; number < num_points; number++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - *saveValue = inputVector[num_points-1]; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_index_max_16u_a16.h b/volk/include/volk/volk_32f_index_max_16u_a16.h new file mode 100644 index 000000000..d070e17d5 --- /dev/null +++ b/volk/include/volk/volk_32f_index_max_16u_a16.h @@ -0,0 +1,148 @@ +#ifndef INCLUDED_volk_32f_index_max_16u_a16_H +#define INCLUDED_volk_32f_index_max_16u_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include + +static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + float maxValuesBuffer[4] __attribute__((aligned(16))); + float maxIndexesBuffer[4] __attribute__((aligned(16))); + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); + maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + +#if LV_HAVE_SSE +#include + +static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + float maxValuesBuffer[4] __attribute__((aligned(16))); + float maxIndexesBuffer[4] __attribute__((aligned(16))); + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); + + maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_GENERIC +static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + float max = src0[0]; + unsigned int index = 0; + + int i = 1; + + for(; i < num_points; ++i) { + + if(src0[i] > max){ + index = i; + max = src0[i]; + } + + } + target[0] = index; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32f_index_max_16u_a16_H*/ diff --git a/volk/include/volk/volk_32f_index_max_aligned16.h b/volk/include/volk/volk_32f_index_max_aligned16.h deleted file mode 100644 index 26322bfa2..000000000 --- a/volk/include/volk/volk_32f_index_max_aligned16.h +++ /dev/null @@ -1,148 +0,0 @@ -#ifndef INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H -#define INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include - -static inline void volk_32f_index_max_aligned16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); - - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; - - float maxValuesBuffer[4] __attribute__((aligned(16))); - float maxIndexesBuffer[4] __attribute__((aligned(16))); - - for(;number < quarterPoints; number++){ - - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - - compareResults = _mm_cmpgt_ps(maxValues, currentValues); - - maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); - } - - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; - } - } - target[0] = (unsigned int)index; - } -} - -#endif /*LV_HAVE_SSE4_1*/ - -#if LV_HAVE_SSE -#include - -static inline void volk_32f_index_max_aligned16_sse(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); - - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; - - float maxValuesBuffer[4] __attribute__((aligned(16))); - float maxIndexesBuffer[4] __attribute__((aligned(16))); - - for(;number < quarterPoints; number++){ - - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - - compareResults = _mm_cmpgt_ps(maxValues, currentValues); - - maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); - - maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); - } - - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; - } - } - target[0] = (unsigned int)index; - } -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_GENERIC -static inline void volk_32f_index_max_aligned16_generic(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - float max = src0[0]; - unsigned int index = 0; - - int i = 1; - - for(; i < num_points; ++i) { - - if(src0[i] > max){ - index = i; - max = src0[i]; - } - - } - target[0] = index; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32f_interleave_16sc_aligned16.h b/volk/include/volk/volk_32f_interleave_16sc_aligned16.h deleted file mode 100644 index 476946b88..000000000 --- a/volk/include/volk/volk_32f_interleave_16sc_aligned16.h +++ /dev/null @@ -1,155 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H -#define INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_interleave_16sc_aligned16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - const unsigned int quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue1, cplxValue2; - __m128i intValue1, intValue2; - - int16_t* complexVectorPtr = (int16_t*)complexVector; - - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); - cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); - cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); - - intValue1 = _mm_cvtps_epi32(cplxValue1); - intValue2 = _mm_cvtps_epi32(cplxValue2); - - intValue1 = _mm_packs_epi32(intValue1, intValue2); - - _mm_store_si128((__m128i*)complexVectorPtr, intValue1); - complexVectorPtr += 8; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } - -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_interleave_16sc_aligned16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - const unsigned int quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - - int16_t* complexVectorPtr = (int16_t*)complexVector; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); - - _mm_store_ps(floatBuffer, cplxValue); - - *complexVectorPtr++ = (int16_t)(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)(floatBuffer[3]); - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); - - _mm_store_ps(floatBuffer, cplxValue); - - *complexVectorPtr++ = (int16_t)(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)(floatBuffer[3]); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } - -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_interleave_16sc_aligned16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - int16_t* complexVectorPtr = (int16_t*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_interleave_32fc_aligned16.h b/volk/include/volk/volk_32f_interleave_32fc_aligned16.h deleted file mode 100644 index 859c6a0ef..000000000 --- a/volk/include/volk/volk_32f_interleave_32fc_aligned16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H -#define INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Interleaves the I & Q vector data into the complex vector - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param num_points The number of complex data values to be interleaved -*/ -static inline void volk_32f_interleave_32fc_aligned16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ - unsigned int number = 0; - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - const uint64_t quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Interleaves the I & Q vector data into the complex vector. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param num_points The number of complex data values to be interleaved -*/ -static inline void volk_32f_interleave_32fc_aligned16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_max_aligned16.h b/volk/include/volk/volk_32f_max_aligned16.h deleted file mode 100644 index d4e30fba8..000000000 --- a/volk/include/volk/volk_32f_max_aligned16.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_MAX_ALIGNED16_H -#define INCLUDED_VOLK_32f_MAX_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_max_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_max_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_max_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -extern void volk_32f_max_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_max_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_max_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32f_MAX_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_min_aligned16.h b/volk/include/volk/volk_32f_min_aligned16.h deleted file mode 100644 index 55daafb6a..000000000 --- a/volk/include/volk/volk_32f_min_aligned16.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_MIN_ALIGNED16_H -#define INCLUDED_VOLK_32f_MIN_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_min_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_min_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_min_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -extern void volk_32f_min_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_min_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_min_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32f_MIN_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_multiply_aligned16.h b/volk/include/volk/volk_32f_multiply_aligned16.h deleted file mode 100644 index 87ae7bcf8..000000000 --- a/volk/include/volk/volk_32f_multiply_aligned16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H -#define INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_multiply_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_multiply_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -extern void volk_32f_multiply_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_multiply_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h deleted file mode 100644 index 323d0513c..000000000 --- a/volk/include/volk/volk_32f_normalize_aligned16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H -#define INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) - \param vecBuffer The buffer of values to be vectorized - \param num_points The number of values in vecBuffer - \param scalar The scale value to be applied to each buffer value -*/ -static inline void volk_32f_normalize_aligned16_sse(float* vecBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* inputPtr = vecBuffer; - - const float invScalar = 1.0 / scalar; - __m128 vecScalar = _mm_set_ps1(invScalar); - - __m128 input1; - - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - - input1 = _mm_load_ps(inputPtr); - - input1 = _mm_mul_ps(input1, vecScalar); - - _mm_store_ps(inputPtr, input1); - - inputPtr += 4; - } - - number = quarterPoints*4; - for(; number < num_points; number++){ - *inputPtr *= invScalar; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Normalizes the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be normalizeed - \param bVector One of the vectors to be normalizeed - \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector -*/ -static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* inputPtr = vecBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *inputPtr *= invScalar; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Normalizes the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be normalizeed - \param bVector One of the vectors to be normalizeed - \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector -*/ -extern void volk_32f_normalize_aligned16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); -static inline void volk_32f_normalize_aligned16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ - float invscalar = 1.0 / scalar; - volk_32f_normalize_aligned16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_power_aligned16.h b/volk/include/volk/volk_32f_power_aligned16.h deleted file mode 100644 index 2ecd8eecb..000000000 --- a/volk/include/volk/volk_32f_power_aligned16.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_POWER_ALIGNED16_H -#define INCLUDED_VOLK_32f_POWER_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Takes each the input vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32f_power_aligned16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - -#if LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - __m128 zeroValue = _mm_setzero_ps(); - __m128 signMask; - __m128 negatedValues; - __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); - __m128 onesMask = _mm_set_ps1(1); - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - signMask = _mm_cmplt_ps(aVal, zeroValue); - negatedValues = _mm_sub_ps(zeroValue, aVal); - aVal = _mm_blendv_ps(aVal, negatedValues, signMask); - - // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after - cVal = powf4(aVal, vPower); // Takes each input value to the specified power - - cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - for(;number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Takes each the input vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32f_power_aligned16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - -#if LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - __m128 zeroValue = _mm_setzero_ps(); - __m128 signMask; - __m128 negatedValues; - __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); - __m128 onesMask = _mm_set_ps1(1); - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - signMask = _mm_cmplt_ps(aVal, zeroValue); - negatedValues = _mm_sub_ps(zeroValue, aVal); - aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); - - // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after - cVal = powf4(aVal, vPower); // Takes each input value to the specified power - - cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - for(;number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Takes each the input vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector - */ -static inline void volk_32f_power_aligned16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_POWER_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h new file mode 100644 index 000000000..ff4d5b19c --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H +#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) + \param bound The interval that the input phase data is in, which is used to modulo the differentiation + \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. + \param num_noints The number of real values in the input vector. +*/ +static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ + if (num_points < 1) { + return; + } + unsigned int number = 1; + unsigned int j = 0; + // num_points-1 keeps Fedora 7's gcc from crashing... + // num_points won't work. :( + const unsigned int quarterPoints = (num_points-1) / 4; + + float* outPtr = outputVector; + const float* inPtr = inputVector; + __m128 upperBound = _mm_set_ps1(bound); + __m128 lowerBound = _mm_set_ps1(-bound); + __m128 next3old1; + __m128 next4; + __m128 boundAdjust; + __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. + __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. + // Do the first 4 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + for (; number < quarterPoints; number++) { + // Load data + next3old1 = _mm_loadu_ps((float*) (inPtr-1)); + next4 = _mm_load_ps(inPtr); + inPtr += 4; + // Subtract and store: + next3old1 = _mm_sub_ps(next4, next3old1); + // Bound: + boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); + boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); + next4 = _mm_cmplt_ps(next3old1, lowerBound); + next4 = _mm_and_ps(next4, negBoundAdjust); + boundAdjust = _mm_or_ps(next4, boundAdjust); + // Make sure we're in the bounding interval: + next3old1 = _mm_add_ps(next3old1, boundAdjust); + _mm_store_ps(outPtr,next3old1); // Store the results back into the output + outPtr += 4; + } + + for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points-1]; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) + \param bound The interval that the input phase data is in, which is used to modulo the differentiation + \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. + \param num_points The number of real values in the input vector. +*/ +static inline void volk_32f_s32f_32f_fm_detect_32f_a16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ + if (num_points < 1) { + return; + } + unsigned int number = 0; + float* outPtr = outputVector; + const float* inPtr = inputVector; + + // Do the first 1 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + + for (number = 1; number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points-1]; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_a16.h b/volk/include/volk/volk_32f_s32f_convert_16s_a16.h new file mode 100644 index 000000000..cf51cf9c5 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_16s_a16.h @@ -0,0 +1,110 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_16s_a16_H +#define INCLUDED_volk_32f_s32f_convert_16s_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16s_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16s_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16s_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_16s_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h new file mode 100644 index 000000000..53d159f82 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h @@ -0,0 +1,113 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_16s_ua16_H +#define INCLUDED_volk_32f_s32f_convert_16s_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_16s_ua16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_16s_ua16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_16s_ua16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_16s_ua16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_a16.h b/volk/include/volk/volk_32f_s32f_convert_32s_a16.h new file mode 100644 index 000000000..0be649418 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_32s_a16.h @@ -0,0 +1,106 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_32s_a16_H +#define INCLUDED_volk_32f_s32f_convert_32s_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32s_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32s_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32s_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_32s_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h new file mode 100644 index 000000000..efb2c3a20 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_32s_ua16_H +#define INCLUDED_volk_32f_s32f_convert_32s_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32s_ua16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32s_ua16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32s_ua16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_32s_ua16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_a16.h b/volk/include/volk/volk_32f_s32f_convert_8s_a16.h new file mode 100644 index 000000000..69ccec5c6 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_8s_a16.h @@ -0,0 +1,117 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_8s_a16_H +#define INCLUDED_volk_32f_s32f_convert_8s_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8s_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8s_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8s_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_8s_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h new file mode 100644 index 000000000..af1652b19 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_8s_ua16_H +#define INCLUDED_volk_32f_s32f_convert_8s_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_8s_ua16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_8s_ua16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_8s_ua16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_8s_ua16_H */ diff --git a/volk/include/volk/volk_32f_s32f_normalize_a16.h b/volk/include/volk/volk_32f_s32f_normalize_a16.h new file mode 100644 index 000000000..0850cddf7 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_normalize_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_s32f_normalize_a16_H +#define INCLUDED_volk_32f_s32f_normalize_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) + \param vecBuffer The buffer of values to be vectorized + \param num_points The number of values in vecBuffer + \param scalar The scale value to be applied to each buffer value +*/ +static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + + const float invScalar = 1.0 / scalar; + __m128 vecScalar = _mm_set_ps1(invScalar); + + __m128 input1; + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + + input1 = _mm_load_ps(inputPtr); + + input1 = _mm_mul_ps(input1, vecScalar); + + _mm_store_ps(inputPtr, input1); + + inputPtr += 4; + } + + number = quarterPoints*4; + for(; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Normalizes the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be normalizeed + \param bVector One of the vectors to be normalizeed + \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector +*/ +static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Normalizes the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be normalizeed + \param bVector One of the vectors to be normalizeed + \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector +*/ +extern void volk_32f_s32f_normalize_a16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); +static inline void volk_32f_s32f_normalize_a16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ + float invscalar = 1.0 / scalar; + volk_32f_s32f_normalize_a16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_normalize_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a16.h b/volk/include/volk/volk_32f_s32f_power_32f_a16.h new file mode 100644 index 000000000..3ed594d9a --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_power_32f_a16.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_volk_32f_s32f_power_32f_a16_H +#define INCLUDED_volk_32f_s32f_power_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_blendv_ps(aVal, negatedValues, signMask); + + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); + + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32f_s32f_power_32f_a16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_power_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h new file mode 100644 index 000000000..32f4fa067 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H +#define INCLUDED_volk_32f_s32f_stddev_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_s32f_stddev_32f_a16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_sqrt_32f_a16.h b/volk/include/volk/volk_32f_sqrt_32f_a16.h new file mode 100644 index 000000000..513c2cffe --- /dev/null +++ b/volk/include/volk/volk_32f_sqrt_32f_a16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32f_sqrt_32f_a16_H +#define INCLUDED_volk_32f_sqrt_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_sqrt_ps(aVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +extern void volk_32f_sqrt_32f_a16_orc_impl(float *, const float*, unsigned int); +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_a16_orc(float* cVector, const float* aVector, unsigned int num_points){ + volk_32f_sqrt_32f_a16_orc_impl(cVector, aVector, num_points); +} + +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_volk_32f_sqrt_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_sqrt_aligned16.h b/volk/include/volk/volk_32f_sqrt_aligned16.h deleted file mode 100644 index f6996ad5f..000000000 --- a/volk/include/volk/volk_32f_sqrt_aligned16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_SQRT_ALIGNED16_H -#define INCLUDED_VOLK_32f_SQRT_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Sqrts the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be sqrted - \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector -*/ -static inline void volk_32f_sqrt_aligned16_sse(float* cVector, const float* aVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - - cVal = _mm_sqrt_ps(aVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = sqrtf(*aPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Sqrts the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be sqrted - \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector -*/ -static inline void volk_32f_sqrt_aligned16_generic(float* cVector, const float* aVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = sqrtf(*aPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -extern void volk_32f_sqrt_aligned16_orc_impl(float *, const float*, unsigned int); -/*! - \brief Sqrts the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be sqrted - \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector -*/ -static inline void volk_32f_sqrt_aligned16_orc(float* cVector, const float* aVector, unsigned int num_points){ - volk_32f_sqrt_aligned16_orc_impl(cVector, aVector, num_points); -} - -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_32f_SQRT_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_stddev_aligned16.h b/volk/include/volk/volk_32f_stddev_aligned16.h deleted file mode 100644 index 1c6a08437..000000000 --- a/volk/include/volk/volk_32f_stddev_aligned16.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H -#define INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Calculates the standard deviation of the input buffer using the supplied mean - \param stddev The calculated standard deviation - \param inputBuffer The buffer of points to calculate the std deviation for - \param mean The mean of the input buffer - \param num_points The number of values in input buffer to used in the stddev calculation -*/ -static inline void volk_32f_stddev_aligned16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ - float returnValue = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = inputBuffer; - - float squareBuffer[4] __attribute__((aligned(128))); - - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal1, aVal2, aVal3, aVal4; - __m128 cVal1, cVal2, cVal3, cVal4; - for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); - - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); - - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); - - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the standard deviation of the input buffer using the supplied mean - \param stddev The calculated standard deviation - \param inputBuffer The buffer of points to calculate the std deviation for - \param mean The mean of the input buffer - \param num_points The number of values in input buffer to used in the stddev calculation -*/ -static inline void volk_32f_stddev_aligned16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ - float returnValue = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - - float squareBuffer[4] __attribute__((aligned(128))); - - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); // aVal = x - aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 - squareAccumulator = _mm_add_ps(squareAccumulator, aVal); - aPtr += 4; - } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the standard deviation of the input buffer using the supplied mean - \param stddev The calculated standard deviation - \param inputBuffer The buffer of points to calculate the std deviation for - \param mean The mean of the input buffer - \param num_points The number of values in input buffer to used in the stddev calculation -*/ -static inline void volk_32f_stddev_aligned16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ - float returnValue = 0; - if(num_points > 0){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h new file mode 100644 index 000000000..2ba809845 --- /dev/null +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h @@ -0,0 +1,169 @@ +#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H +#define INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + float meanBuffer[4] __attribute__((aligned(128))); + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + float meanBuffer[4] __attribute__((aligned(128))); + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_32f_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h deleted file mode 100644 index 1cd502257..000000000 --- a/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h +++ /dev/null @@ -1,169 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H -#define INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_aligned16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = inputBuffer; - float meanBuffer[4] __attribute__((aligned(128))); - float squareBuffer[4] __attribute__((aligned(128))); - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal1, aVal2, aVal3, aVal4; - __m128 cVal1, cVal2, cVal3, cVal4; - for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); - accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x - - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); - accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x - - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); - accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x - - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); - accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_aligned16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - float meanBuffer[4] __attribute__((aligned(128))); - float squareBuffer[4] __attribute__((aligned(128))); - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); // aVal = x - accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x - aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 - squareAccumulator = _mm_add_ps(squareAccumulator, aVal); - aPtr += 4; - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_aligned16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_subtract_aligned16.h b/volk/include/volk/volk_32f_subtract_aligned16.h deleted file mode 100644 index e15242901..000000000 --- a/volk/include/volk/volk_32f_subtract_aligned16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H -#define INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -static inline void volk_32f_subtract_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_sub_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -static inline void volk_32f_subtract_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -extern void volk_32f_subtract_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_subtract_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_subtract_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_sum_of_poly_aligned16.h b/volk/include/volk/volk_32f_sum_of_poly_aligned16.h deleted file mode 100644 index a326e62b1..000000000 --- a/volk/include/volk/volk_32f_sum_of_poly_aligned16.h +++ /dev/null @@ -1,151 +0,0 @@ -#ifndef INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H -#define INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H - -#include -#include -#include - -#ifndef MAX -#define MAX(X,Y) ((X) > (Y)?(X):(Y)) -#endif - -#if LV_HAVE_SSE3 -#include -#include - -static inline void volk_32f_sum_of_poly_aligned16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - - float result = 0.0; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - //float fith = 0.0; - - - - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; - - xmm9 = _mm_setzero_ps(); - xmm1 = _mm_setzero_ps(); - - xmm0 = _mm_load1_ps(¢er_point_array[0]); - xmm6 = _mm_load1_ps(¢er_point_array[1]); - xmm7 = _mm_load1_ps(¢er_point_array[2]); - xmm8 = _mm_load1_ps(¢er_point_array[3]); - //xmm11 = _mm_load1_ps(¢er_point_array[4]); - xmm10 = _mm_load1_ps(cutoff); - - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 2) & 3; - int i = 0; - - for(; i < bound; ++i) { - xmm2 = _mm_load_ps(src0); - xmm2 = _mm_max_ps(xmm10, xmm2); - xmm3 = _mm_mul_ps(xmm2, xmm2); - xmm4 = _mm_mul_ps(xmm2, xmm3); - xmm5 = _mm_mul_ps(xmm3, xmm3); - //xmm12 = _mm_mul_ps(xmm3, xmm4); - - xmm2 = _mm_mul_ps(xmm2, xmm0); - xmm3 = _mm_mul_ps(xmm3, xmm6); - xmm4 = _mm_mul_ps(xmm4, xmm7); - xmm5 = _mm_mul_ps(xmm5, xmm8); - //xmm12 = _mm_mul_ps(xmm12, xmm11); - - xmm2 = _mm_add_ps(xmm2, xmm3); - xmm3 = _mm_add_ps(xmm4, xmm5); - - src0 += 4; - - xmm9 = _mm_add_ps(xmm2, xmm9); - - xmm1 = _mm_add_ps(xmm3, xmm1); - - //xmm9 = _mm_add_ps(xmm12, xmm9); - } - - xmm2 = _mm_hadd_ps(xmm9, xmm1); - xmm3 = _mm_hadd_ps(xmm2, xmm2); - xmm4 = _mm_hadd_ps(xmm3, xmm3); - - _mm_store_ss(&result, xmm4); - - - - for(i = 0; i < leftovers; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth);// + - //center_point_array[4] * fith); - } - - result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; - - target[0] = result; -} - - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC - -static inline void volk_32f_sum_of_poly_aligned16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - - - float result = 0.0; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - //float fith = 0.0; - - - - int i = 0; - - for(; i < num_bytes >> 2; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); //+ - //center_point_array[4] * fith); - /*printf("%f12...%d\n", (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth) + - //center_point_array[4] * fith) + - (center_point_array[4]), i); - */ - } - - result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); - - - - *target = result; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h new file mode 100644 index 000000000..514998800 --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h @@ -0,0 +1,95 @@ +#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a16_H +#define INCLUDED_volk_32fc_32f_multiply_32fc_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies the input complex vector with the input float vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the float values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; + for(;number < quarterPoints; number++){ + + aVal1 = _mm_load_ps((const float*)aPtr); + aPtr += 2; + + aVal2 = _mm_load_ps((const float*)aPtr); + aPtr += 2; + + bVal = _mm_load_ps(bPtr); + bPtr += 4; + + bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); + bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); + + cVal = _mm_mul_ps(aVal1, bVal1); + + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + cPtr += 2; + + cVal = _mm_mul_ps(aVal2, bVal2); + + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr); + bPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_32f_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32fc_32f_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + volk_32fc_32f_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_GENERIC */ + + + +#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h deleted file mode 100644 index 304ed8e2d..000000000 --- a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H -#define INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies the input complex vector with the input float vector and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The complex vector to be multiplied - \param bVector The vectors containing the float values to be multiplied against each complex value in aVector - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_32f_multiply_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; - for(;number < quarterPoints; number++){ - - aVal1 = _mm_load_ps((const float*)aPtr); - aPtr += 2; - - aVal2 = _mm_load_ps((const float*)aPtr); - aPtr += 2; - - bVal = _mm_load_ps(bPtr); - bPtr += 4; - - bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); - bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); - - cVal = _mm_mul_ps(aVal1, bVal1); - - _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container - cPtr += 2; - - cVal = _mm_mul_ps(aVal2, bVal2); - - _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container - - cPtr += 2; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr); - bPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The complex vector to be multiplied - \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_32f_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The complex vector to be multiplied - \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_32fc_32f_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32fc_32f_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - volk_32fc_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_GENERIC */ - - - -#endif /* INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_32f_power_32fc_a16.h new file mode 100644 index 000000000..6f9e9e3ee --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_power_32fc_a16.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_volk_32fc_32f_power_32fc_a16_H +#define INCLUDED_volk_32fc_32f_power_32fc_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32fc_32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + + __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + cplxValue2 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + // Convert to polar coordinates + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + phase = atan2f4(qValue, iValue); // Calculate the Phase + + magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values + + // Now calculate the power of the polar coordinate data + magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power + + phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power + + // Convert back to cartesian coordinates + iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude + qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude + + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values + + _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container + + cPtr += 2; + + _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + lv_32fc_t complexPower; + ((float*)&complexPower)[0] = power; + ((float*)&complexPower)[1] = 0; + for(;number < num_points; number++){ + *cPtr++ = lv_cpow((*aPtr++), complexPower); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32fc_32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + lv_32fc_t complexPower; + ((float*)&complexPower)[0] = power; + ((float*)&complexPower)[1] = 0.0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_cpow((*aPtr++), complexPower); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_32f_power_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h b/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h deleted file mode 100644 index 2d71ee4f8..000000000 --- a/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H -#define INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Takes each the input complex vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The complex vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32fc_32f_power_32fc_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - -#if LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - - __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps((float*)aPtr); - aPtr += 2; - - cplxValue2 = _mm_load_ps((float*)aPtr); - aPtr += 2; - - // Convert to polar coordinates - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - phase = atan2f4(qValue, iValue); // Calculate the Phase - - magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values - - // Now calculate the power of the polar coordinate data - magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power - - phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power - - // Convert back to cartesian coordinates - iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude - qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude - - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values - - _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container - - cPtr += 2; - - _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container - - cPtr += 2; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - lv_32fc_t complexPower; - ((float*)&complexPower)[0] = power; - ((float*)&complexPower)[1] = 0; - for(;number < num_points; number++){ - *cPtr++ = lv_cpow((*aPtr++), complexPower); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Takes each the input complex vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The complex vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector - */ -static inline void volk_32fc_32f_power_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; - lv_32fc_t complexPower; - ((float*)&complexPower)[0] = power; - ((float*)&complexPower)[1] = 0.0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = lv_cpow((*aPtr++), complexPower); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h new file mode 100644 index 000000000..cd9cc8160 --- /dev/null +++ b/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h @@ -0,0 +1,344 @@ +#ifndef INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H +#define INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H + +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + + + + asm volatile + ( + "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %[conjugator], %%r9\n\t" + " movq %%rcx, %%rax\n\t" + " movaps 0(%%r9), %%xmm8\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movups 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " xorps %%xmm8, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " xorps %%xmm8, %%xmm3\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + " xorps %%xmm8, %%xmm2\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); + + } + + return; +} +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 +static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + int bound = num_bytes >> 4; + int leftovers = num_bytes % 16; + + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " #movl 12(%%ebp), %%eax # input\n\t" + " #movl 16(%%ebp), %%edx # taps\n\t" + " #movl 20(%%ebp), %%ecx # n_bytes\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%[eax]), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%[edx]), %%xmm2\n\t" + " movl %[ecx], (%[out])\n\t" + " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" + + " xorps %%xmm1, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%[eax]), %%xmmA\n\t" + "# movaps (%[edx]), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%[edx]), %%xmm3\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " xorps %%xmm1, %%xmm3\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " movaps 16(%[eax]), %%xmm1\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%[eax]), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %[eax]\n\t" + " movaps 32(%[edx]), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " xorps %%xmm1, %%xmm2\n\t" + " addl $32, %[edx]\n\t" + ".%=L1_test:\n\t" + " decl %[ecx]\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" + " shrl $4, %[ecx]\n\t" + " andl $1, %[ecx]\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " #movl 8(%%ebp), %[eax] \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%[out])\n\t" + " movss (%[out]), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %[eax] # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) + ); + + + + + printf("%d, %d\n", leftovers, bound); + + for(; leftovers > 0; leftovers -= 8) { + + + *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + + + +#endif /*INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h new file mode 100644 index 000000000..2ccfcf2f2 --- /dev/null +++ b/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h @@ -0,0 +1,468 @@ +#ifndef INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H +#define INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H + +#include +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_32fc_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; + + } + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + asm + ( + "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %%rcx, %%rax\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 + +static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " movl 12(%%ebp), %%eax # input\n\t" + " movl 16(%%ebp), %%edx # taps\n\t" + " movl 20(%%ebp), %%ecx # n_bytes\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%eax), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%edx), %%xmm2\n\t" + " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%eax), %%xmmA\n\t" + "# movaps (%%edx), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%eax), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%edx), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%eax), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %%eax\n\t" + " movaps 32(%%edx), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " addl $32, %%edx\n\t" + ".%=L1_test:\n\t" + " decl %%ecx\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" + " shrl $4, %%ecx\n\t" + " andl $1, %%ecx\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " movl 8(%%ebp), %%eax \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%%eax)\n\t" + " movss (%%eax), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %%eax # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : + : "eax", "ecx", "edx" + ); + + + int getem = num_bytes % 16; + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_bytes >> 4; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + lv_32fc_t dotProductVector[2] __attribute__((aligned(16))); + + _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if((num_bytes >> 2) != 0) { + dotProduct += (*a) * (*b); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + volk_32fc_32fc_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes); + // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now + /* + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; + float *p_input, *p_taps; + __m64 *p_result; + + p_result = (__m64*)result; + p_input = (float*)input; + p_taps = (float*)taps; + + static const __m128i neg = {0x000000000000000080000000}; + + int i = 0; + + int bound = (num_bytes >> 5); + int leftovers = (num_bytes & 24) >> 3; + + real0 = _mm_sub_ps(real0, real0); + real1 = _mm_sub_ps(real1, real1); + im0 = _mm_sub_ps(im0, im0); + im1 = _mm_sub_ps(im1, im1); + + for(; i < bound; ++i) { + + + xmm0 = _mm_load_ps(p_input); + xmm1 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm2 = _mm_load_ps(p_input); + xmm3 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); + xmm5 = _mm_unpackhi_ps(xmm1, xmm3); + xmm0 = _mm_unpacklo_ps(xmm0, xmm2); + xmm2 = _mm_unpacklo_ps(xmm1, xmm3); + + //imaginary vector from input + xmm1 = _mm_unpackhi_ps(xmm0, xmm4); + //real vector from input + xmm3 = _mm_unpacklo_ps(xmm0, xmm4); + //imaginary vector from taps + xmm0 = _mm_unpackhi_ps(xmm2, xmm5); + //real vector from taps + xmm2 = _mm_unpacklo_ps(xmm2, xmm5); + + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); + xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); + + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); + xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); + + real0 = _mm_add_ps(xmm4, real0); + real1 = _mm_add_ps(xmm5, real1); + im0 = _mm_add_ps(xmm6, im0); + im1 = _mm_add_ps(xmm7, im1); + + } + + + + + real1 = _mm_xor_ps(real1, (__m128)neg); + + + im0 = _mm_add_ps(im0, im1); + real0 = _mm_add_ps(real0, real1); + + im0 = _mm_add_ps(im0, real0); + + _mm_storel_pi(p_result, im0); + + for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { + + *result += input[i] * taps[i]; + } + */ +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h new file mode 100644 index 000000000..59259882c --- /dev/null +++ b/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h @@ -0,0 +1,95 @@ +#ifndef INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H +#define INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32fc_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32fc_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_32fc_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); +static inline void volk_32fc_32fc_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + static const float mask = -0.0; + volk_32fc_32fc_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points); +} +#endif /* LV_HAVE_ORC */ + + + + + +#endif /* INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h new file mode 100644 index 000000000..14f511697 --- /dev/null +++ b/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H +#define INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + lv_32fc_t diff; + memset(&diff, 0x0, 2*sizeof(float)); + + float sq_dist = 0.0; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm8 = _mm_load1_ps(&scalar); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h new file mode 100644 index 000000000..b6c72adbf --- /dev/null +++ b/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h @@ -0,0 +1,112 @@ +#ifndef INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H +#define INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_32fc_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + lv_32fc_t diff; + float sq_dist; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + _mm_store_ps(target, xmm4); + + target += 4; + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_32fc_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_atan2_32f_aligned16.h b/volk/include/volk/volk_32fc_atan2_32f_aligned16.h deleted file mode 100644 index df0ebb987..000000000 --- a/volk/include/volk/volk_32fc_atan2_32f_aligned16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H -#define INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief performs the atan2 on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). - \param normalizeFactor The atan2 results will be divided by this normalization factor. - \param num_points The number of complex values in the input vector. -*/ -static inline void volk_32fc_atan2_32f_aligned16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* outPtr = outputVector; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - const float invNormalizeFactor = 1.0 / normalizeFactor; - -#if LV_HAVE_LIB_SIMDMATH - __m128 testVector = _mm_set_ps1(2*M_PI); - __m128 correctVector = _mm_set_ps1(M_PI); - __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); - __m128 phase; - __m128 complex1, complex2, iValue, qValue; - __m128 keepMask; - - for (; number < quarterPoints; number++) { - // Load IQ data: - complex1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - complex2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); - // Arctan to get phase: - phase = atan2f4(qValue, iValue); - // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. - // Compare to 2pi: - keepMask = _mm_cmpneq_ps(phase,testVector); - phase = _mm_blendv_ps(correctVector, phase, keepMask); - // done with above correction. - phase = _mm_mul_ps(phase, vNormalizeFactor); - _mm_store_ps((float*)outPtr, phase); - outPtr += 4; - } - number = quarterPoints * 4; -#endif /* LV_HAVE_SIMDMATH_H */ - - for (; number < num_points; number++) { - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } -} -#endif /* LV_HAVE_SSE4_1 */ - - -#if LV_HAVE_SSE -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief performs the atan2 on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). - \param normalizeFactor The atan2 results will be divided by this normalization factor. - \param num_points The number of complex values in the input vector. -*/ -static inline void volk_32fc_atan2_32f_aligned16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* outPtr = outputVector; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - const float invNormalizeFactor = 1.0 / normalizeFactor; - -#if LV_HAVE_LIB_SIMDMATH - __m128 testVector = _mm_set_ps1(2*M_PI); - __m128 correctVector = _mm_set_ps1(M_PI); - __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); - __m128 phase; - __m128 complex1, complex2, iValue, qValue; - __m128 mask; - __m128 keepMask; - - for (; number < quarterPoints; number++) { - // Load IQ data: - complex1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - complex2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); - // Arctan to get phase: - phase = atan2f4(qValue, iValue); - // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. - // Compare to 2pi: - keepMask = _mm_cmpneq_ps(phase,testVector); - phase = _mm_and_ps(phase, keepMask); - mask = _mm_andnot_ps(keepMask, correctVector); - phase = _mm_or_ps(phase, mask); - // done with above correction. - phase = _mm_mul_ps(phase, vNormalizeFactor); - _mm_store_ps((float*)outPtr, phase); - outPtr += 4; - } - number = quarterPoints * 4; -#endif /* LV_HAVE_SIMDMATH_H */ - - for (; number < num_points; number++) { - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief performs the atan2 on the input vector and stores the results in the output vector. - \param outputVector The vector where the results will be stored. - \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin). - \param normalizeFactor The atan2 results will be divided by this normalization factor. - \param num_points The number of complex values in the input vector. -*/ -static inline void volk_32fc_atan2_32f_aligned16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ - float* outPtr = outputVector; - const float* inPtr = (float*)inputVector; - const float invNormalizeFactor = 1.0 / normalizeFactor; - unsigned int number; - for ( number = 0; number < num_points; number++) { - const float real = *inPtr++; - const float imag = *inPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h deleted file mode 100644 index 60103c1b5..000000000 --- a/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h +++ /dev/null @@ -1,344 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H -#define INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H - -#include -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32fc_conjugate_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; - sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] + in[3] * tp[3]; - sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - - } - /* - for(i = 0; i < num_bytes >> 3; ++i) { - *result += input[i] * conjf(taps[i]); - } - */ -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_32fc_conjugate_dot_prod_aligned16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - - - - asm volatile - ( - "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %[conjugator], %%r9\n\t" - " movq %%rcx, %%rax\n\t" - " movaps 0(%%r9), %%xmm8\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movups 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " xorps %%xmm8, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " xorps %%xmm8, %%xmm3\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - " xorps %%xmm8, %%xmm2\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) - :"rax", "r8", "r9", "r10" - ); - - - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); - - } - - return; -} -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_conjugate_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - int bound = num_bytes >> 4; - int leftovers = num_bytes % 16; - - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " #movl 12(%%ebp), %%eax # input\n\t" - " #movl 16(%%ebp), %%edx # taps\n\t" - " #movl 20(%%ebp), %%ecx # n_bytes\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%[eax]), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%[edx]), %%xmm2\n\t" - " movl %[ecx], (%[out])\n\t" - " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" - - " xorps %%xmm1, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%[eax]), %%xmmA\n\t" - "# movaps (%[edx]), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%[edx]), %%xmm3\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " xorps %%xmm1, %%xmm3\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " movaps 16(%[eax]), %%xmm1\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%[eax]), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %[eax]\n\t" - " movaps 32(%[edx]), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " xorps %%xmm1, %%xmm2\n\t" - " addl $32, %[edx]\n\t" - ".%=L1_test:\n\t" - " decl %[ecx]\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" - " shrl $4, %[ecx]\n\t" - " andl $1, %[ecx]\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " #movl 8(%%ebp), %[eax] \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%[out])\n\t" - " movss (%[out]), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %[eax] # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) - ); - - - - - printf("%d, %d\n", leftovers, bound); - - for(; leftovers > 0; leftovers -= 8) { - - - *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); - - } - - return; - - - - - - -} - -#endif /*LV_HAVE_SSE*/ - - - -#endif /*INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h new file mode 100644 index 000000000..3ee579c2e --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H +#define INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h deleted file mode 100644 index 02085cd1e..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H -#define INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h new file mode 100644 index 000000000..404defc36 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h @@ -0,0 +1,78 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H +#define INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_64f_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_64f_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h deleted file mode 100644 index 3d9ebccdd..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H -#define INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_aligned16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - - for(;number < halfPoints; number++){ - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(qBufferPtr, dVal); - - iBufferPtr += 2; - qBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_aligned16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - *qBufferPtr++ = (double)*complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h new file mode 100644 index 000000000..53235e5f7 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h @@ -0,0 +1,80 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_real_16s_a16_H +#define INCLUDED_volk_32fc_deinterleave_real_16s_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_16s_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + iValue = _mm_mul_ps(iValue, vScalar); + + _mm_store_ps(floatBuffer, iValue); + *iBufferPtr++ = (int16_t)(floatBuffer[0]); + *iBufferPtr++ = (int16_t)(floatBuffer[1]); + *iBufferPtr++ = (int16_t)(floatBuffer[2]); + *iBufferPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + iBufferPtr = &iBuffer[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_real_16s_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h deleted file mode 100644 index 3026b2422..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H -#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_16s_aligned16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - - iValue = _mm_mul_ps(iValue, vScalar); - - _mm_store_ps(floatBuffer, iValue); - *iBufferPtr++ = (int16_t)(floatBuffer[0]); - *iBufferPtr++ = (int16_t)(floatBuffer[1]); - *iBufferPtr++ = (int16_t)(floatBuffer[2]); - *iBufferPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - iBufferPtr = &iBuffer[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h new file mode 100644 index 000000000..9838ec88b --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a16_H +#define INCLUDED_volk_32fc_deinterleave_real_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + float* iBufferPtr = iBuffer; + + __m128 cplxValue1, cplxValue2, iValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h deleted file mode 100644 index 2af973bcc..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H -#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - float* iBufferPtr = iBuffer; - - __m128 cplxValue1, cplxValue2, iValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h new file mode 100644 index 000000000..af392d074 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a16_H +#define INCLUDED_volk_32fc_deinterleave_real_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + iBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_64f_a16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h deleted file mode 100644 index f408589c4..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H -#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_64f_aligned16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - for(;number < halfPoints; number++){ - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - iBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_64f_aligned16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_dot_prod_aligned16.h deleted file mode 100644 index 1a834dc25..000000000 --- a/volk/include/volk/volk_32fc_dot_prod_aligned16.h +++ /dev/null @@ -1,468 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H -#define INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H - -#include -#include -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32fc_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; - - } - -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_32fc_dot_prod_aligned16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - - asm - ( - "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %%rcx, %%rax\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) - :"rax", "r8", "r9", "r10" - ); - - - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - - } - - return; - -} - -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 - -static inline void volk_32fc_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " movl 12(%%ebp), %%eax # input\n\t" - " movl 16(%%ebp), %%edx # taps\n\t" - " movl 20(%%ebp), %%ecx # n_bytes\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%eax), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%edx), %%xmm2\n\t" - " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%eax), %%xmmA\n\t" - "# movaps (%%edx), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%eax), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%edx), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%eax), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %%eax\n\t" - " movaps 32(%%edx), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " addl $32, %%edx\n\t" - ".%=L1_test:\n\t" - " decl %%ecx\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" - " shrl $4, %%ecx\n\t" - " andl $1, %%ecx\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " movl 8(%%ebp), %%eax \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%%eax)\n\t" - " movss (%%eax), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %%eax # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : - : "eax", "ecx", "edx" - ); - - - int getem = num_bytes % 16; - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - - } - - return; - - - - - - -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_SSE3 - -#include - -static inline void volk_32fc_dot_prod_aligned16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); - - unsigned int number = 0; - const unsigned int halfPoints = num_bytes >> 4; - - __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; - - dotProdVal = _mm_setzero_ps(); - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together - - a += 2; - b += 2; - } - - lv_32fc_t dotProductVector[2] __attribute__((aligned(16))); - - _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - - if((num_bytes >> 2) != 0) { - dotProduct += (*a) * (*b); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_SSE4_1 - -#include - -static inline void volk_32fc_dot_prod_aligned16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - volk_32fc_dot_prod_aligned16_sse3(result, input, taps, num_bytes); - // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now - /* - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; - float *p_input, *p_taps; - __m64 *p_result; - - p_result = (__m64*)result; - p_input = (float*)input; - p_taps = (float*)taps; - - static const __m128i neg = {0x000000000000000080000000}; - - int i = 0; - - int bound = (num_bytes >> 5); - int leftovers = (num_bytes & 24) >> 3; - - real0 = _mm_sub_ps(real0, real0); - real1 = _mm_sub_ps(real1, real1); - im0 = _mm_sub_ps(im0, im0); - im1 = _mm_sub_ps(im1, im1); - - for(; i < bound; ++i) { - - - xmm0 = _mm_load_ps(p_input); - xmm1 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm2 = _mm_load_ps(p_input); - xmm3 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm4 = _mm_unpackhi_ps(xmm0, xmm2); - xmm5 = _mm_unpackhi_ps(xmm1, xmm3); - xmm0 = _mm_unpacklo_ps(xmm0, xmm2); - xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - - //imaginary vector from input - xmm1 = _mm_unpackhi_ps(xmm0, xmm4); - //real vector from input - xmm3 = _mm_unpacklo_ps(xmm0, xmm4); - //imaginary vector from taps - xmm0 = _mm_unpackhi_ps(xmm2, xmm5); - //real vector from taps - xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - - xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); - xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - - xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); - xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - - real0 = _mm_add_ps(xmm4, real0); - real1 = _mm_add_ps(xmm5, real1); - im0 = _mm_add_ps(xmm6, im0); - im1 = _mm_add_ps(xmm7, im1); - - } - - - - - real1 = _mm_xor_ps(real1, (__m128)neg); - - - im0 = _mm_add_ps(im0, im1); - real0 = _mm_add_ps(real0, real1); - - im0 = _mm_add_ps(im0, real0); - - _mm_storel_pi(p_result, im0); - - for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { - - *result += input[i] * taps[i]; - } - */ -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_index_max_16u_a16.h b/volk/include/volk/volk_32fc_index_max_16u_a16.h new file mode 100644 index 000000000..532ae4e7c --- /dev/null +++ b/volk/include/volk/volk_32fc_index_max_16u_a16.h @@ -0,0 +1,215 @@ +#ifndef INCLUDED_volk_32fc_index_max_16u_a16_H +#define INCLUDED_volk_32fc_index_max_16u_a16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + + +static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { + + + + union bit128 holderf; + union bit128 holderi; + float sq_dist = 0.0; + + + + + union bit128 xmm5, xmm4; + __m128 xmm1, xmm2, xmm3; + __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; + + xmm5.int_vec = xmmfive = _mm_setzero_si128(); + xmm4.int_vec = xmmfour = _mm_setzero_si128(); + holderf.int_vec = holder0 = _mm_setzero_si128(); + holderi.int_vec = holder1 = _mm_setzero_si128(); + + + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! + xmm9 = xmm8 = _mm_setzero_si128(); + xmm10 = _mm_set_epi32(4, 4, 4, 4); + xmm3 = _mm_setzero_ps(); +; + + //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); + + for(; i < bound; ++i) { + + xmm1 = _mm_load_ps((float*)src0); + xmm2 = _mm_load_ps((float*)&src0[2]); + + + src0 += 4; + + + xmm1 = _mm_mul_ps(xmm1, xmm1); + xmm2 = _mm_mul_ps(xmm2, xmm2); + + + xmm1 = _mm_hadd_ps(xmm1, xmm2); + + xmm3 = _mm_max_ps(xmm1, xmm3); + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + + + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); + + } + + + for(i = 0; i < leftovers0; ++i) { + + + xmm2 = _mm_load_ps((float*)src0); + + xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8); + xmm8 = (__m128i)xmm1; + + xmm2 = _mm_mul_ps(xmm2, xmm2); + + src0 += 2; + + xmm1 = _mm_hadd_ps(xmm2, xmm2); + + xmm3 = _mm_max_ps(xmm1, xmm3); + + xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); + + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + } + + + + + for(i = 0; i < leftovers1; ++i) { + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + + sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); + + xmm2 = _mm_load1_ps(&sq_dist); + + xmm1 = xmm3; + + xmm3 = _mm_max_ss(xmm3, xmm2); + + + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + xmm8 = _mm_shuffle_epi32(xmm8, 0x00); + + xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); + + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + } + + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); + + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + _mm_store_ps((float*)&(holderf.f), xmm3); + _mm_store_si128(&(holderi.int_vec), xmm9); + + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; + + + + /* + float placeholder = 0.0; + uint32_t temp0, temp1; + unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); + unsigned int l0 = g0 ^ 1; + + unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); + unsigned int l1 = g1 ^ 1; + + temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; + temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; + sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; + placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; + + g0 = (sq_dist > placeholder); + l0 = g0 ^ 1; + target[0] = g0 * temp0 + l0 * temp1; + */ + +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { + float sq_dist = 0.0; + float max = 0.0; + unsigned int index = 0; + + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + + sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); + + index = sq_dist > max ? i : index; + max = sq_dist > max ? sq_dist : max; + + + } + target[0] = index; + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_index_max_16u_a16_H*/ diff --git a/volk/include/volk/volk_32fc_index_max_aligned16.h b/volk/include/volk/volk_32fc_index_max_aligned16.h deleted file mode 100644 index d77a95f90..000000000 --- a/volk/include/volk/volk_32fc_index_max_aligned16.h +++ /dev/null @@ -1,215 +0,0 @@ -#ifndef INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H -#define INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H - -#include -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -#include - - -static inline void volk_32fc_index_max_aligned16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { - - - - union bit128 holderf; - union bit128 holderi; - float sq_dist = 0.0; - - - - - union bit128 xmm5, xmm4; - __m128 xmm1, xmm2, xmm3; - __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; - - xmm5.int_vec = xmmfive = _mm_setzero_si128(); - xmm4.int_vec = xmmfour = _mm_setzero_si128(); - holderf.int_vec = holder0 = _mm_setzero_si128(); - holderi.int_vec = holder1 = _mm_setzero_si128(); - - - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - - xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! - xmm9 = xmm8 = _mm_setzero_si128(); - xmm10 = _mm_set_epi32(4, 4, 4, 4); - xmm3 = _mm_setzero_ps(); -; - - //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); - - for(; i < bound; ++i) { - - xmm1 = _mm_load_ps((float*)src0); - xmm2 = _mm_load_ps((float*)&src0[2]); - - - src0 += 4; - - - xmm1 = _mm_mul_ps(xmm1, xmm1); - xmm2 = _mm_mul_ps(xmm2, xmm2); - - - xmm1 = _mm_hadd_ps(xmm1, xmm2); - - xmm3 = _mm_max_ps(xmm1, xmm3); - - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - - xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - - xmm9 = _mm_add_epi32(xmm11, xmm12); - - xmm8 = _mm_add_epi32(xmm8, xmm10); - - - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); - - } - - - for(i = 0; i < leftovers0; ++i) { - - - xmm2 = _mm_load_ps((float*)src0); - - xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8); - xmm8 = (__m128i)xmm1; - - xmm2 = _mm_mul_ps(xmm2, xmm2); - - src0 += 2; - - xmm1 = _mm_hadd_ps(xmm2, xmm2); - - xmm3 = _mm_max_ps(xmm1, xmm3); - - xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); - - - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - - xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - - xmm9 = _mm_add_epi32(xmm11, xmm12); - - xmm8 = _mm_add_epi32(xmm8, xmm10); - //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - - } - - - - - for(i = 0; i < leftovers1; ++i) { - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - - - sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - - xmm2 = _mm_load1_ps(&sq_dist); - - xmm1 = xmm3; - - xmm3 = _mm_max_ss(xmm3, xmm2); - - - - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - xmm8 = _mm_shuffle_epi32(xmm8, 0x00); - - xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); - - - xmm9 = _mm_add_epi32(xmm11, xmm12); - - } - - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); - - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - - _mm_store_ps((float*)&(holderf.f), xmm3); - _mm_store_si128(&(holderi.int_vec), xmm9); - - target[0] = holderi.i[0]; - sq_dist = holderf.f[0]; - target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; - sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; - target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; - sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; - target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; - sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; - - - - /* - float placeholder = 0.0; - uint32_t temp0, temp1; - unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); - unsigned int l0 = g0 ^ 1; - - unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); - unsigned int l1 = g1 ^ 1; - - temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; - temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; - sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; - placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; - - g0 = (sq_dist > placeholder); - l0 = g0 ^ 1; - target[0] = g0 * temp0 + l0 * temp1; - */ - -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC -static inline void volk_32fc_index_max_aligned16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { - float sq_dist = 0.0; - float max = 0.0; - unsigned int index = 0; - - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - - sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); - - index = sq_dist > max ? i : index; - max = sq_dist > max ? sq_dist : max; - - - } - target[0] = index; - -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h deleted file mode 100644 index 4e590e120..000000000 --- a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H -#define INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, result; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - result = _mm_mul_ps(result, vScalar); - - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - result = _mm_mul_ps(result, vScalar); - - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_32fc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_32fc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - volk_32fc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a16.h b/volk/include/volk/volk_32fc_magnitude_32f_a16.h new file mode 100644 index 000000000..be7216dce --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_32f_a16.h @@ -0,0 +1,132 @@ +#ifndef INCLUDED_volk_32fc_magnitude_32f_a16_H +#define INCLUDED_volk_32fc_magnitude_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +extern void volk_32fc_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); +static inline void volk_32fc_magnitude_32f_a16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + volk_32fc_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32fc_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h deleted file mode 100644 index 3ea62da6a..000000000 --- a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h +++ /dev/null @@ -1,132 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H -#define INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -extern void volk_32fc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); -static inline void volk_32fc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - volk_32fc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_multiply_aligned16.h b/volk/include/volk/volk_32fc_multiply_aligned16.h deleted file mode 100644 index c8f2418c3..000000000 --- a/volk/include/volk/volk_32fc_multiply_aligned16.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H -#define INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H - -#include -#include -#include -#include - -#if LV_HAVE_SSE3 -#include - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_multiply_aligned16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_store_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * (*b); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_32fc_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); -static inline void volk_32fc_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - static const float mask = -0.0; - volk_32fc_multiply_aligned16_orc_impl(cVector, aVector, bVector, mask, num_points); -} -#endif /* LV_HAVE_ORC */ - - - - - -#endif /* INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h b/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h deleted file mode 100644 index 52ec0f95b..000000000 --- a/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H -#define INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Calculates the log10 power value divided by the RBW for each input point - \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param rbw The resolution bandwith of the fft spectrum - \param num_points The number of fft data points -*/ -static inline void volk_32fc_power_spectral_density_32f_aligned16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - -#if LV_HAVE_LIB_SIMDMATH - __m128 magScalar = _mm_set_ps1(10.0); - magScalar = _mm_div_ps(magScalar, logf4(magScalar)); - - __m128 invRBW = _mm_set_ps1(iRBW); - - __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); - - __m128 power; - __m128 input1, input2; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the complex values - input1 =_mm_load_ps(inputPtr); - inputPtr += 4; - input2 =_mm_load_ps(inputPtr); - inputPtr += 4; - - // Apply the normalization factor - input1 = _mm_mul_ps(input1, invNormalizationFactor); - input2 = _mm_mul_ps(input2, invNormalizationFactor); - - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm_mul_ps(input1, input1); - // (r3*r3), (i3*i3), (r4*r4), (i4*i4) - input2 = _mm_mul_ps(input2, input2); - - // Horizontal add, to add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - power = _mm_hadd_ps(input1, input2); - - // Divide by the rbw - power = _mm_mul_ps(power, invRBW); - - // Calculate the natural log power - power = logf4(power); - - // Convert to log10 and multiply by 10.0 - power = _mm_mul_ps(power, magScalar); - - // Store the floating point results - _mm_store_ps(destPtr, power); - - destPtr += 4; - } - - number = quarterPoints*4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); - destPtr++; - } - -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the log10 power value divided by the RBW for each input point - \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param rbw The resolution bandwith of the fft spectrum - \param num_points The number of fft data points -*/ -static inline void volk_32fc_power_spectral_density_32f_aligned16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ - // Calculate the Power of the complex point - const float* inputPtr = (float*)complexFFTInput; - float* realFFTDataPointsPtr = logPowerOutput; - unsigned int point; - const float invRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - - for(point = 0; point < num_points; point++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); - - realFFTDataPointsPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h b/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h deleted file mode 100644 index 645629b9d..000000000 --- a/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H -#define INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Calculates the log10 power value for each input point - \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param num_points The number of fft data points -*/ -static inline void volk_32fc_power_spectrum_32f_aligned16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iNormalizationFactor = 1.0 / normalizationFactor; -#if LV_HAVE_LIB_SIMDMATH - __m128 magScalar = _mm_set_ps1(10.0); - magScalar = _mm_div_ps(magScalar, logf4(magScalar)); - - __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); - - __m128 power; - __m128 input1, input2; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the complex values - input1 =_mm_load_ps(inputPtr); - inputPtr += 4; - input2 =_mm_load_ps(inputPtr); - inputPtr += 4; - - // Apply the normalization factor - input1 = _mm_mul_ps(input1, invNormalizationFactor); - input2 = _mm_mul_ps(input2, invNormalizationFactor); - - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm_mul_ps(input1, input1); - // (r3*r3), (i3*i3), (r4*r4), (i4*i4) - input2 = _mm_mul_ps(input2, input2); - - // Horizontal add, to add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - power = _mm_hadd_ps(input1, input2); - - // Calculate the natural log power - power = logf4(power); - - // Convert to log10 and multiply by 10.0 - power = _mm_mul_ps(power, magScalar); - - // Store the floating point results - _mm_store_ps(destPtr, power); - - destPtr += 4; - } - - number = quarterPoints*4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - - destPtr++; - } - -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the log10 power value for each input point - \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided agains all the input values before the power is calculated - \param num_points The number of fft data points -*/ -static inline void volk_32fc_power_spectrum_32f_aligned16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ - // Calculate the Power of the complex point - const float* inputPtr = (float*)complexFFTInput; - float* realFFTDataPointsPtr = logPowerOutput; - const float iNormalizationFactor = 1.0 / normalizationFactor; - unsigned int point; - for(point = 0; point < num_points; point++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - - - realFFTDataPointsPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h new file mode 100644 index 000000000..e9f74438d --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a16_H +#define INCLUDED_volk_32fc_s32f_atan2_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float invNormalizeFactor = 1.0 / normalizeFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 testVector = _mm_set_ps1(2*M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase,testVector); + phase = _mm_blendv_ps(correctVector, phase, keepMask); + // done with above correction. + phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; +#endif /* LV_HAVE_SIMDMATH_H */ + + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float invNormalizeFactor = 1.0 / normalizeFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 testVector = _mm_set_ps1(2*M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 mask; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase,testVector); + phase = _mm_and_ps(phase, keepMask); + mask = _mm_andnot_ps(keepMask, correctVector); + phase = _mm_or_ps(phase, mask); + // done with above correction. + phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; +#endif /* LV_HAVE_SIMDMATH_H */ + + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The vector where the results will be stored. + \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_s32f_atan2_32f_a16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ + float* outPtr = outputVector; + const float* inPtr = (float*)inputVector; + const float invNormalizeFactor = 1.0 / normalizeFactor; + unsigned int number; + for ( number = 0; number < num_points; number++) { + const float real = *inPtr++; + const float imag = *inPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h new file mode 100644 index 000000000..dc3c6741a --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H +#define INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, result; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_32fc_s32f_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_32fc_s32f_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + volk_32fc_s32f_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h new file mode 100644 index 000000000..39d8f7aa2 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H +#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value for each input point + \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iNormalizationFactor = 1.0 / normalizationFactor; +#if LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value for each input point + \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided agains all the input values before the power is calculated + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_power_spectrum_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + const float iNormalizationFactor = 1.0 / normalizationFactor; + unsigned int point; + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h new file mode 100644 index 000000000..29ccdaef7 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h @@ -0,0 +1,134 @@ +#ifndef INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H +#define INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invRBW = _mm_set_ps1(iRBW); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Divide by the rbw + power = _mm_mul_ps(power, invRBW); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + unsigned int point; + const float invRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_square_dist_aligned16.h b/volk/include/volk/volk_32fc_square_dist_aligned16.h deleted file mode 100644 index 6458ea4dd..000000000 --- a/volk/include/volk/volk_32fc_square_dist_aligned16.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H -#define INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -#include - -static inline void volk_32fc_square_dist_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - - - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - lv_32fc_t diff; - float sq_dist; - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); - - - for(; i < bound - 1; ++i) { - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm3 = _mm_load_ps((float*)&points[2]); - - _mm_store_ps(target, xmm4); - - target += 4; - - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - _mm_store_ps(target, xmm4); - - target += 4; - - for(i = 0; i < leftovers0; ++i) { - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); - - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); - - xmm4 = _mm_hadd_ps(xmm6, xmm6); - - _mm_storeh_pi((__m64*)target, xmm4); - - target += 2; - } - - for(i = 0; i < leftovers1; ++i) { - - diff = src0[0] - points[0]; - - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[0] = sq_dist; - } -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC -static inline void volk_32fc_square_dist_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - lv_32fc_t diff; - float sq_dist; - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; - - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[i] = sq_dist; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h b/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h deleted file mode 100644 index 0fcc86f1e..000000000 --- a/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H -#define INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H - -#include -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -#include - -static inline void volk_32fc_square_dist_scalar_mult_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - - - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - - lv_32fc_t diff; - memset(&diff, 0x0, 2*sizeof(float)); - - float sq_dist = 0.0; - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - - - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm8 = _mm_load1_ps(&scalar); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); - - - for(; i < bound - 1; ++i) { - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm3 = _mm_load_ps((float*)&points[2]); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_store_ps(target, xmm4); - - target += 4; - - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_store_ps(target, xmm4); - - target += 4; - - - for(i = 0; i < leftovers0; ++i) { - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); - - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); - - xmm4 = _mm_hadd_ps(xmm6, xmm6); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_storeh_pi((__m64*)target, xmm4); - - target += 2; - } - - for(i = 0; i < leftovers1; ++i) { - - diff = src0[0] - points[0]; - - sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - - target[0] = sq_dist; - } -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC -static inline void volk_32fc_square_dist_scalar_mult_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - lv_32fc_t diff; - float sq_dist; - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; - - sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - - target[i] = sq_dist; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32s_32s_and_32s_a16.h b/volk/include/volk/volk_32s_32s_and_32s_a16.h new file mode 100644 index 000000000..0e8380757 --- /dev/null +++ b/volk/include/volk/volk_32s_32s_and_32s_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32s_32s_and_32s_a16_H +#define INCLUDED_volk_32s_32s_and_32s_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32s_32s_and_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_and_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] & bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32s_32s_and_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) & (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +extern void volk_32s_32s_and_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32s_32s_and_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32s_32s_and_32s_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32s_32s_and_32s_a16_H */ diff --git a/volk/include/volk/volk_32s_32s_or_32s_a16.h b/volk/include/volk/volk_32s_32s_or_32s_a16.h new file mode 100644 index 000000000..2dcf2e551 --- /dev/null +++ b/volk/include/volk/volk_32s_32s_or_32s_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32s_32s_or_32s_a16_H +#define INCLUDED_volk_32s_32s_or_32s_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32s_32s_or_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_or_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] | bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32s_32s_or_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) | (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +extern void volk_32s_32s_or_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32s_32s_or_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32s_32s_or_32s_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32s_32s_or_32s_a16_H */ diff --git a/volk/include/volk/volk_32s_and_aligned16.h b/volk/include/volk/volk_32s_and_aligned16.h deleted file mode 100644 index 16c63fd48..000000000 --- a/volk/include/volk/volk_32s_and_aligned16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_VOLK_32s_AND_ALIGNED16_H -#define INCLUDED_VOLK_32s_AND_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -static inline void volk_32s_and_aligned16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_and_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] & bVector[number]; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -static inline void volk_32s_and_aligned16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) & (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -extern void volk_32s_and_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32s_and_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32s_and_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32s_AND_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_convert_32f_aligned16.h b/volk/include/volk/volk_32s_convert_32f_aligned16.h deleted file mode 100644 index a407e68bd..000000000 --- a/volk/include/volk/volk_32s_convert_32f_aligned16.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H -#define INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32s_convert_32f_aligned16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_load_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_store_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32s_convert_32f_aligned16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_convert_32f_unaligned16.h b/volk/include/volk/volk_32s_convert_32f_unaligned16.h deleted file mode 100644 index ad7d4eb17..000000000 --- a/volk/include/volk/volk_32s_convert_32f_unaligned16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H -#define INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_32s_convert_32f_unaligned16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_32s_convert_32f_unaligned16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_or_aligned16.h b/volk/include/volk/volk_32s_or_aligned16.h deleted file mode 100644 index 64748d535..000000000 --- a/volk/include/volk/volk_32s_or_aligned16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_VOLK_32s_OR_ALIGNED16_H -#define INCLUDED_VOLK_32s_OR_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -static inline void volk_32s_or_aligned16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_or_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] | bVector[number]; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -static inline void volk_32s_or_aligned16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) | (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -extern void volk_32s_or_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32s_or_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32s_or_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32s_OR_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_a16.h b/volk/include/volk/volk_32s_s32f_convert_32f_a16.h new file mode 100644 index 000000000..c16ecc9dd --- /dev/null +++ b/volk/include/volk/volk_32s_s32f_convert_32f_a16.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_volk_32s_s32f_convert_32f_a16_H +#define INCLUDED_volk_32s_s32f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32s_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_load_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_store_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32s_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32s_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h new file mode 100644 index 000000000..4eb5a5b85 --- /dev/null +++ b/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32s_s32f_convert_32f_ua16_H +#define INCLUDED_volk_32s_s32f_convert_32f_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32s_s32f_convert_32f_ua16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32s_s32f_convert_32f_ua16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32s_s32f_convert_32f_ua16_H */ diff --git a/volk/include/volk/volk_32u_byteswap_a16.h b/volk/include/volk/volk_32u_byteswap_a16.h new file mode 100644 index 000000000..7556ec7b1 --- /dev/null +++ b/volk/include/volk/volk_32u_byteswap_a16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32u_byteswap_a16_H +#define INCLUDED_volk_32u_byteswap_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + + uint32_t* inputPtr = intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = quarterPoints*4; + for(; number < num_points; number++){ + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_a16_generic(uint32_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = intsToSwap; + + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_32u_byteswap_aligned16.h b/volk/include/volk/volk_32u_byteswap_aligned16.h deleted file mode 100644 index 09173a9d5..000000000 --- a/volk/include/volk/volk_32u_byteswap_aligned16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H -#define INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - -/*! - \brief Byteswaps (in-place) an aligned vector of int32_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_32u_byteswap_aligned16_sse2(uint32_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - - uint32_t* inputPtr = intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = quarterPoints*4; - for(; number < num_points; number++){ - uint32_t outputVal = *inputPtr; - outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); - *inputPtr = outputVal; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int32_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_32u_byteswap_aligned16_generic(uint32_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = intsToSwap; - - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output = *inputPtr; - output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); - - *inputPtr = output; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32u_popcnt_a16.h b/volk/include/volk/volk_32u_popcnt_a16.h new file mode 100644 index 000000000..f6e25e4e8 --- /dev/null +++ b/volk/include/volk/volk_32u_popcnt_a16.h @@ -0,0 +1,36 @@ +#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H +#define INCLUDED_VOLK_32u_POPCNT_A16_H + +#include +#include + + +#if LV_HAVE_GENERIC + +static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t value) { + + // This is faster than a lookup table + uint32_t retVal = value; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + + *ret = retVal; +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE4_2 + +#include + +static inline void volk_32u_popcnt_a16_sse4_2(uint32_t* ret, const uint32_t value) { + *ret = _mm_popcnt_u32(value); +} + +#endif /*LV_HAVE_SSE4_2*/ + +#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/ diff --git a/volk/include/volk/volk_32u_popcnt_aligned16.h b/volk/include/volk/volk_32u_popcnt_aligned16.h deleted file mode 100644 index 37cfd112c..000000000 --- a/volk/include/volk/volk_32u_popcnt_aligned16.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H -#define INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H - -#include -#include - - -#if LV_HAVE_GENERIC - -static inline void volk_32u_popcnt_aligned16_generic(uint32_t* ret, const uint32_t value) { - - // This is faster than a lookup table - uint32_t retVal = value; - - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - - *ret = retVal; -} - -#endif /*LV_HAVE_GENERIC*/ - -#if LV_HAVE_SSE4_2 - -#include - -static inline void volk_32u_popcnt_aligned16_sse4_2(uint32_t* ret, const uint32_t value) { - *ret = _mm_popcnt_u32(value); -} - -#endif /*LV_HAVE_SSE4_2*/ - -#endif /*INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_64f_64f_max_64f_a16.h b/volk/include/volk/volk_64f_64f_max_64f_a16.h new file mode 100644 index 000000000..7e091851f --- /dev/null +++ b/volk/include/volk/volk_64f_64f_max_64f_a16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_64f_64f_max_64f_a16_H +#define INCLUDED_volk_64f_64f_max_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_64f_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_max_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_64f_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_64f_64f_max_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_64f_min_64f_a16.h b/volk/include/volk/volk_64f_64f_min_64f_a16.h new file mode 100644 index 000000000..f2bcbe83b --- /dev/null +++ b/volk/include/volk/volk_64f_64f_min_64f_a16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_64f_64f_min_64f_a16_H +#define INCLUDED_volk_64f_64f_min_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_64f_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_min_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_64f_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_64f_64f_min_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_a16.h b/volk/include/volk/volk_64f_convert_32f_a16.h new file mode 100644 index 000000000..7dca065f0 --- /dev/null +++ b/volk/include/volk/volk_64f_convert_32f_a16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_volk_64f_convert_32f_a16_H +#define INCLUDED_volk_64f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_a16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_a16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_aligned16.h b/volk/include/volk/volk_64f_convert_32f_aligned16.h deleted file mode 100644 index 44df66104..000000000 --- a/volk/include/volk/volk_64f_convert_32f_aligned16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H -#define INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_64f_convert_32f_aligned16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); - - ret = _mm_movelh_ps(ret, ret2); - - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_64f_convert_32f_aligned16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_ua16.h b/volk/include/volk/volk_64f_convert_32f_ua16.h new file mode 100644 index 000000000..7774db1b7 --- /dev/null +++ b/volk/include/volk/volk_64f_convert_32f_ua16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_volk_64f_convert_32f_ua16_H +#define INCLUDED_volk_64f_convert_32f_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_ua16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_ua16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64f_convert_32f_ua16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_unaligned16.h b/volk/include/volk/volk_64f_convert_32f_unaligned16.h deleted file mode 100644 index 08cfb6127..000000000 --- a/volk/include/volk/volk_64f_convert_32f_unaligned16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H -#define INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_64f_convert_32f_unaligned16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); - - ret = _mm_movelh_ps(ret, ret2); - - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_64f_convert_32f_unaligned16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_64f_max_aligned16.h b/volk/include/volk/volk_64f_max_aligned16.h deleted file mode 100644 index ce4907a8c..000000000 --- a/volk/include/volk/volk_64f_max_aligned16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_VOLK_64f_MAX_ALIGNED16_H -#define INCLUDED_VOLK_64f_MAX_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_max_aligned16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); - - cVal = _mm_max_pd(aVal, bVal); - - _mm_store_pd(cPtr,cVal); // Store the results back into the C container - - aPtr += 2; - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_max_aligned16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_VOLK_64f_MAX_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64f_min_aligned16.h b/volk/include/volk/volk_64f_min_aligned16.h deleted file mode 100644 index acf4d6b2a..000000000 --- a/volk/include/volk/volk_64f_min_aligned16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_VOLK_64f_MIN_ALIGNED16_H -#define INCLUDED_VOLK_64f_MIN_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_min_aligned16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); - - cVal = _mm_min_pd(aVal, bVal); - - _mm_store_pd(cPtr,cVal); // Store the results back into the C container - - aPtr += 2; - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_min_aligned16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_VOLK_64f_MIN_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64u_byteswap_a16.h b/volk/include/volk/volk_64u_byteswap_a16.h new file mode 100644 index 000000000..0eefe0138 --- /dev/null +++ b/volk/include/volk/volk_64u_byteswap_a16.h @@ -0,0 +1,88 @@ +#ifndef INCLUDED_volk_64u_byteswap_a16_H +#define INCLUDED_volk_64u_byteswap_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + uint64_t number = 0; + const unsigned int halfPoints = num_points / 2; + for(;number < halfPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + + // Reorder the two words + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = halfPoints*2; + for(; number < num_points; number++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_a16_generic(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_64u_byteswap_aligned16.h b/volk/include/volk/volk_64u_byteswap_aligned16.h deleted file mode 100644 index d5e1b6f30..000000000 --- a/volk/include/volk/volk_64u_byteswap_aligned16.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H -#define INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - -/*! - \brief Byteswaps (in-place) an aligned vector of int64_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_64u_byteswap_aligned16_sse2(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - uint64_t number = 0; - const unsigned int halfPoints = num_points / 2; - for(;number < halfPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - - // Reorder the two words - output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); - - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = halfPoints*2; - for(; number < num_points; number++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int64_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_64u_byteswap_aligned16_generic(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64u_popcnt_a16.h b/volk/include/volk/volk_64u_popcnt_a16.h new file mode 100644 index 000000000..59511dc29 --- /dev/null +++ b/volk/include/volk/volk_64u_popcnt_a16.h @@ -0,0 +1,50 @@ +#ifndef INCLUDED_volk_64u_popcnt_a16_H +#define INCLUDED_volk_64u_popcnt_a16_H + +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t value) { + + const uint32_t* valueVector = (const uint32_t*)&value; + + // This is faster than a lookup table + uint32_t retVal = valueVector[0]; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + uint64_t retVal64 = retVal; + + retVal = valueVector[1]; + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + retVal64 += retVal; + + *ret = retVal64; + +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE4_2 && LV_HAVE_64 + +#include + +static inline void volk_64u_popcnt_a16_sse4_2(uint64_t* ret, const uint64_t value) { + *ret = _mm_popcnt_u64(value); + +} + +#endif /*LV_HAVE_SSE4_2*/ + +#endif /*INCLUDED_volk_64u_popcnt_a16_H*/ diff --git a/volk/include/volk/volk_64u_popcnt_aligned16.h b/volk/include/volk/volk_64u_popcnt_aligned16.h deleted file mode 100644 index 4d62f9375..000000000 --- a/volk/include/volk/volk_64u_popcnt_aligned16.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H -#define INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H - -#include -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_64u_popcnt_aligned16_generic(uint64_t* ret, const uint64_t value) { - - const uint32_t* valueVector = (const uint32_t*)&value; - - // This is faster than a lookup table - uint32_t retVal = valueVector[0]; - - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - uint64_t retVal64 = retVal; - - retVal = valueVector[1]; - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - retVal64 += retVal; - - *ret = retVal64; - -} - -#endif /*LV_HAVE_GENERIC*/ - -#if LV_HAVE_SSE4_2 && LV_HAVE_64 - -#include - -static inline void volk_64u_popcnt_aligned16_sse4_2(uint64_t* ret, const uint64_t value) { - *ret = _mm_popcnt_u64(value); - -} - -#endif /*LV_HAVE_SSE4_2*/ - -#endif /*INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_8s_convert_16s_a16.h b/volk/include/volk/volk_8s_convert_16s_a16.h new file mode 100644 index 000000000..38efdb6a3 --- /dev/null +++ b/volk/include/volk/volk_8s_convert_16s_a16.h @@ -0,0 +1,83 @@ +#ifndef INCLUDED_volk_8s_convert_16s_a16_H +#define INCLUDED_volk_8s_convert_16s_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_convert_16s_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_convert_16s_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +extern void volk_8s_convert_16s_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); +static inline void volk_8s_convert_16s_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + volk_8s_convert_16s_a16_orc_impl(outputVector, inputVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_aligned16.h b/volk/include/volk/volk_8s_convert_16s_aligned16.h deleted file mode 100644 index c52c64eae..000000000 --- a/volk/include/volk/volk_8s_convert_16s_aligned16.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED16_H -#define INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_convert_16s_aligned16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_convert_16s_aligned16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -extern void volk_8s_convert_16s_aligned16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); -static inline void volk_8s_convert_16s_aligned16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - volk_8s_convert_16s_aligned16_orc_impl(outputVector, inputVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_ua16.h b/volk/include/volk/volk_8s_convert_16s_ua16.h new file mode 100644 index 000000000..a726bfb5e --- /dev/null +++ b/volk/include/volk/volk_8s_convert_16s_ua16.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_volk_8s_convert_16s_ua16_H +#define INCLUDED_volk_8s_convert_16s_ua16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8s_convert_16s_ua16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8s_convert_16s_ua16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_unaligned16.h b/volk/include/volk/volk_8s_convert_16s_unaligned16.h deleted file mode 100644 index 05b916cea..000000000 --- a/volk/include/volk/volk_8s_convert_16s_unaligned16.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED16_H -#define INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned - */ -static inline void volk_8s_convert_16s_unaligned16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned - */ -static inline void volk_8s_convert_16s_unaligned16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_32f_aligned16.h b/volk/include/volk/volk_8s_convert_32f_aligned16.h deleted file mode 100644 index 700a0fa42..000000000 --- a/volk/include/volk/volk_8s_convert_32f_aligned16.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED16_H -#define INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_convert_32f_aligned16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_convert_32f_aligned16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -extern void volk_8s_convert_32f_aligned16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); -static inline void volk_8s_convert_32f_aligned16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - volk_8s_convert_32f_aligned16_orc_impl(outputVector, inputVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_32f_unaligned16.h b/volk/include/volk/volk_8s_convert_32f_unaligned16.h deleted file mode 100644 index 8019aac9a..000000000 --- a/volk/include/volk/volk_8s_convert_32f_unaligned16.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED16_H -#define INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_8s_convert_32f_unaligned16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1( iScalar ); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_8s_convert_32f_unaligned16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_a16.h b/volk/include/volk/volk_8s_s32f_convert_32f_a16.h new file mode 100644 index 000000000..45185ac2e --- /dev/null +++ b/volk/include/volk/volk_8s_s32f_convert_32f_a16.h @@ -0,0 +1,105 @@ +#ifndef INCLUDED_volk_8s_s32f_convert_32f_a16_H +#define INCLUDED_volk_8s_s32f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +extern void volk_8s_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); +static inline void volk_8s_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + volk_8s_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h new file mode 100644 index 000000000..310824580 --- /dev/null +++ b/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_volk_8s_s32f_convert_32f_ua16_H +#define INCLUDED_volk_8s_s32f_convert_32f_ua16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1( iScalar ); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8s_s32f_convert_32f_ua16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h b/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h new file mode 100644 index 000000000..eae1185ec --- /dev/null +++ b/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h @@ -0,0 +1,102 @@ +#ifndef INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H +#define INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + lv_16sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + int16_t* c16Ptr = (int16_t*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + int16_t* c16Ptr = (int16_t*)cVector; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number =0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H */ diff --git a/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h new file mode 100644 index 000000000..621276b08 --- /dev/null +++ b/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H +#define INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + __m128 ret; + lv_32fc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + a += 4; + b += 4; + } + + number = quarterPoints * 4; + float* cFloatPtr = (float*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cFloatPtr++ = lv_creal(temp) / scalar; + *cFloatPtr++ = lv_cimag(temp) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* cPtr = (float*)cVector; + const float invScalar = 1.0 / scalar; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number = 0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cPtr++ = (lv_creal(temp) * invScalar); + *cPtr++ = (lv_cimag(temp) * invScalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h new file mode 100644 index 000000000..6a35e969d --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H +#define INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_16s_16s_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + __m128i complexVal, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iOutputVal = _mm_cvtepi8_epi16(iOutputVal); + iOutputVal = _mm_slli_epi16(iOutputVal, 8); + + qOutputVal = _mm_cvtepi8_epi16(qOutputVal); + qOutputVal = _mm_slli_epi16(qOutputVal, 8); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h deleted file mode 100644 index 38eaa49ea..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_16s_aligned16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - __m128i complexVal, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iOutputVal = _mm_cvtepi8_epi16(iOutputVal); - iOutputVal = _mm_slli_epi16(iOutputVal, 8); - - qOutputVal = _mm_cvtepi8_epi16(qOutputVal); - qOutputVal = _mm_slli_epi16(qOutputVal, 8); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h deleted file mode 100644 index d0c118965..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h +++ /dev/null @@ -1,164 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_32f_aligned16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue, qFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - iComplexVal = _mm_srli_si128(iComplexVal, 4); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - qBufferPtr += 4; - - qComplexVal = _mm_srli_si128(qComplexVal, 4); - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - - qBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; - - float floatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); - - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int8_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h new file mode 100644 index 000000000..67ffebd99 --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_8sc_deinterleave_real_16s_a16_H +#define INCLUDED_volk_8sc_deinterleave_real_16s_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_16s_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i complexVal, outputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + outputVal = _mm_cvtepi8_epi16(complexVal); + outputVal = _mm_slli_epi16(outputVal, 7); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8sc_deinterleave_real_16s_a16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h deleted file mode 100644 index d0cb49494..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_16s_aligned16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i complexVal, outputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - outputVal = _mm_cvtepi8_epi16(complexVal); - outputVal = _mm_slli_epi16(outputVal, 7); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE4_1 */ - - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h deleted file mode 100644 index c849448ea..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h +++ /dev/null @@ -1,133 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - - complexVal = _mm_srli_si128(complexVal, 4); - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; - - iValue = _mm_load_ps(floatBuffer); - - iValue = _mm_mul_ps(iValue, invScalar); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h new file mode 100644 index 000000000..ecffc092e --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, outputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); + + outputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h deleted file mode 100644 index d84d64568..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, outputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); - - outputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h b/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h deleted file mode 100644 index 470a67539..000000000 --- a/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H -#define INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_multiply_conjugate_16sc_aligned16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - lv_16sc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - const int shuffleMask = _MM_SHUFFLE(2,3,0,1); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); - - a += 4; - b += 4; - c += 4; - } - - number = quarterPoints * 4; - int16_t* c16Ptr = (int16_t*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_multiply_conjugate_16sc_aligned16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - int16_t* c16Ptr = (int16_t*)cVector; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number =0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h b/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h deleted file mode 100644 index 52b444cf7..000000000 --- a/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H -#define INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_multiply_conjugate_32fc_aligned16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - __m128 ret; - lv_32fc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - const int shuffleMask = _MM_SHUFFLE(2,3,0,1); - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - a += 4; - b += 4; - } - - number = quarterPoints * 4; - float* cFloatPtr = (float*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cFloatPtr++ = lv_creal(temp) / scalar; - *cFloatPtr++ = lv_cimag(temp) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_multiply_conjugate_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* cPtr = (float*)cVector; - const float invScalar = 1.0 / scalar; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number = 0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cPtr++ = (lv_creal(temp) * invScalar); - *cPtr++ = (lv_cimag(temp) * invScalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h new file mode 100644 index 000000000..cedbf202c --- /dev/null +++ b/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h @@ -0,0 +1,164 @@ +#ifndef INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H +#define INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue, qFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + iComplexVal = _mm_srli_si128(iComplexVal, 4); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 4; + + qComplexVal = _mm_srli_si128(qComplexVal, 4); + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + + qBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + float floatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int8_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h new file mode 100644 index 000000000..902795131 --- /dev/null +++ b/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h @@ -0,0 +1,133 @@ +#ifndef INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H +#define INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + + complexVal = _mm_srli_si128(complexVal, 4); + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 9fded9a3e..fc1ec10ef 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -55,7 +55,7 @@ functions = []; for line in mfile: - subline = re.search(".*(aligned).*", line); + subline = re.search(".*(a16).*", line); if subline: subsubline = re.search("(?<=volk_).*", subline.group(0)); if subsubline: @@ -70,11 +70,10 @@ datatypes = set(datatypes); for line in mfile: for dt in datatypes: if dt in line: - subline = re.search("(volk_" + dt +"_.*(aligned).*\.h)", line); + subline = re.search("(volk_" + dt +"_.*(a16).*\.h)", line); if subline: subsubline = re.search(".+(?=\.h)", subline.group(0)); - functions.append(subsubline.group(0)); archs = []; -- cgit From e3600f59e76c3dc08aedfd77629b7c5c48df86af Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Thu, 20 Jan 2011 16:30:09 -0800 Subject: volk: renamed all files. added all tests. some test things are still broken. --- volk/include/volk/Makefile.am | 128 +++--- volk/include/volk/make_c.py | 2 +- volk/include/volk/volk_16i_branch_4_state_8_a16.h | 194 +++++++++ volk/include/volk/volk_16i_convert_8i_a16.h | 69 +++ volk/include/volk/volk_16i_convert_8i_u.h | 71 ++++ volk/include/volk/volk_16i_max_star_16i_a16.h | 108 +++++ .../volk/volk_16i_max_star_horizontal_16i_a16.h | 130 ++++++ .../volk/volk_16i_permute_and_scalar_add_a16.h | 139 ++++++ volk/include/volk/volk_16i_s32f_convert_32f_a16.h | 119 ++++++ volk/include/volk/volk_16i_s32f_convert_32f_u.h | 122 ++++++ .../volk/volk_16i_x4_quad_max_star_16i_a16.h | 191 +++++++++ .../include/volk/volk_16i_x5_add_quad_16i_x4_a16.h | 136 ++++++ .../volk/volk_16ic_deinterleave_16i_x2_a16.h | 158 +++++++ .../volk/volk_16ic_deinterleave_real_16i_a16.h | 120 ++++++ .../volk/volk_16ic_deinterleave_real_8i_a16.h | 94 +++++ volk/include/volk/volk_16ic_magnitude_16i_a16.h | 190 +++++++++ .../volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h | 108 +++++ .../volk_16ic_s32f_deinterleave_real_32f_a16.h | 125 ++++++ .../volk/volk_16ic_s32f_magnitude_32f_a16.h | 179 ++++++++ volk/include/volk/volk_16s_add_quad_a16.h | 136 ------ volk/include/volk/volk_16s_branch_4_state_8_a16.h | 194 --------- volk/include/volk/volk_16s_convert_8s_a16.h | 69 --- volk/include/volk/volk_16s_convert_8s_ua16.h | 71 ---- volk/include/volk/volk_16s_max_star_16s_a16.h | 108 ----- .../volk/volk_16s_max_star_horizontal_16s_a16.h | 130 ------ .../volk/volk_16s_permute_and_scalar_add_a16.h | 139 ------ volk/include/volk/volk_16s_quad_max_star_16s_a16.h | 191 --------- volk/include/volk/volk_16s_s32f_convert_32f_a16.h | 119 ------ volk/include/volk/volk_16s_s32f_convert_32f_ua16.h | 122 ------ .../volk/volk_16sc_deinterleave_16s_16s_a16.h | 158 ------- .../volk/volk_16sc_deinterleave_real_16s_a16.h | 120 ------ .../volk/volk_16sc_deinterleave_real_8s_a16.h | 94 ----- volk/include/volk/volk_16sc_magnitude_16s_a16.h | 190 --------- .../volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h | 108 ----- .../volk_16sc_s32f_deinterleave_real_32f_a16.h | 125 ------ .../volk/volk_16sc_s32f_magnitude_32f_a16.h | 179 -------- .../volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h | 151 ------- volk/include/volk/volk_32f_32f_add_32f_a16.h | 81 ---- volk/include/volk/volk_32f_32f_divide_32f_a16.h | 82 ---- volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h | 184 -------- volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h | 184 -------- .../volk/volk_32f_32f_interleave_32fc_a16.h | 75 ---- volk/include/volk/volk_32f_32f_max_32f_a16.h | 85 ---- volk/include/volk/volk_32f_32f_min_32f_a16.h | 85 ---- volk/include/volk/volk_32f_32f_multiply_32f_a16.h | 81 ---- .../volk/volk_32f_32f_s32f_interleave_16sc_a16.h | 155 ------- volk/include/volk/volk_32f_32f_subtract_32f_a16.h | 81 ---- volk/include/volk/volk_32f_convert_64f_u.h | 70 +++ volk/include/volk/volk_32f_convert_64f_ua16.h | 70 --- volk/include/volk/volk_32f_s32f_convert_16i_a16.h | 110 +++++ volk/include/volk/volk_32f_s32f_convert_16i_u.h | 113 +++++ volk/include/volk/volk_32f_s32f_convert_16s_a16.h | 110 ----- volk/include/volk/volk_32f_s32f_convert_16s_ua16.h | 113 ----- volk/include/volk/volk_32f_s32f_convert_32i_a16.h | 106 +++++ volk/include/volk/volk_32f_s32f_convert_32i_u.h | 109 +++++ volk/include/volk/volk_32f_s32f_convert_32s_a16.h | 106 ----- volk/include/volk/volk_32f_s32f_convert_32s_ua16.h | 109 ----- volk/include/volk/volk_32f_s32f_convert_8i_a16.h | 117 ++++++ volk/include/volk/volk_32f_s32f_convert_8i_u.h | 120 ++++++ volk/include/volk/volk_32f_s32f_convert_8s_a16.h | 117 ------ volk/include/volk/volk_32f_s32f_convert_8s_ua16.h | 120 ------ .../volk/volk_32f_stddev_and_mean_32f_32f_a16.h | 169 -------- .../volk/volk_32f_stddev_and_mean_32f_x2_a16.h | 169 ++++++++ volk/include/volk/volk_32f_x2_add_32f_a16.h | 81 ++++ volk/include/volk/volk_32f_x2_divide_32f_a16.h | 82 ++++ volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h | 184 ++++++++ volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 184 ++++++++ .../include/volk/volk_32f_x2_interleave_32fc_a16.h | 75 ++++ volk/include/volk/volk_32f_x2_max_32f_a16.h | 85 ++++ volk/include/volk/volk_32f_x2_min_32f_a16.h | 85 ++++ volk/include/volk/volk_32f_x2_multiply_32f_a16.h | 81 ++++ .../volk/volk_32f_x2_s32f_interleave_16ic_a16.h | 155 +++++++ volk/include/volk/volk_32f_x2_subtract_32f_a16.h | 81 ++++ .../include/volk/volk_32f_x3_sum_of_poly_32f_a16.h | 151 +++++++ .../volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h | 344 --------------- .../volk/volk_32fc_32fc_dot_prod_32fc_a16.h | 468 --------------------- .../volk/volk_32fc_32fc_multiply_32fc_a16.h | 95 ----- ...2fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h | 126 ------ .../volk/volk_32fc_32fc_square_dist_32f_a16.h | 112 ----- .../volk/volk_32fc_deinterleave_32f_32f_a16.h | 75 ---- .../volk/volk_32fc_deinterleave_32f_x2_a16.h | 75 ++++ .../volk/volk_32fc_deinterleave_64f_64f_a16.h | 78 ---- .../volk/volk_32fc_deinterleave_64f_x2_a16.h | 78 ++++ .../volk/volk_32fc_deinterleave_real_16i_a16.h | 80 ++++ .../volk/volk_32fc_deinterleave_real_16s_a16.h | 80 ---- .../volk/volk_32fc_s32f_magnitude_16i_a16.h | 158 +++++++ .../volk/volk_32fc_s32f_magnitude_16s_a16.h | 158 ------- ...32fc_s32f_s32f_power_spectral_density_32f_a16.h | 134 ------ ...k_32fc_s32f_x2_power_spectral_density_32f_a16.h | 134 ++++++ .../volk_32fc_x2_conjugate_dot_prod_32fc_a16.h | 344 +++++++++++++++ volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 468 +++++++++++++++++++++ volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h | 95 +++++ ..._32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h | 126 ++++++ .../volk/volk_32fc_x2_square_dist_32f_a16.h | 112 +++++ volk/include/volk/volk_32i_s32f_convert_32f_a16.h | 73 ++++ volk/include/volk/volk_32i_s32f_convert_32f_u.h | 75 ++++ volk/include/volk/volk_32i_x2_and_32i_a16.h | 81 ++++ volk/include/volk/volk_32i_x2_or_32i_a16.h | 81 ++++ volk/include/volk/volk_32s_32s_and_32s_a16.h | 81 ---- volk/include/volk/volk_32s_32s_or_32s_a16.h | 81 ---- volk/include/volk/volk_32s_s32f_convert_32f_a16.h | 73 ---- volk/include/volk/volk_32s_s32f_convert_32f_ua16.h | 75 ---- volk/include/volk/volk_64f_64f_max_64f_a16.h | 71 ---- volk/include/volk/volk_64f_64f_min_64f_a16.h | 71 ---- volk/include/volk/volk_64f_convert_32f_u.h | 67 +++ volk/include/volk/volk_64f_convert_32f_ua16.h | 67 --- volk/include/volk/volk_64f_x2_max_64f_a16.h | 71 ++++ volk/include/volk/volk_64f_x2_min_64f_a16.h | 71 ++++ volk/include/volk/volk_8i_convert_16i_a16.h | 83 ++++ volk/include/volk/volk_8i_convert_16i_u.h | 73 ++++ volk/include/volk/volk_8i_s32f_convert_32f_a16.h | 105 +++++ volk/include/volk/volk_8i_s32f_convert_32f_u.h | 94 +++++ .../volk/volk_8ic_deinterleave_16i_x2_a16.h | 77 ++++ .../volk/volk_8ic_deinterleave_real_16i_a16.h | 66 +++ .../volk/volk_8ic_deinterleave_real_8i_a16.h | 67 +++ .../volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h | 164 ++++++++ .../volk/volk_8ic_s32f_deinterleave_real_32f_a16.h | 133 ++++++ .../volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h | 102 +++++ .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h | 122 ++++++ volk/include/volk/volk_8s_convert_16s_a16.h | 83 ---- volk/include/volk/volk_8s_convert_16s_ua16.h | 73 ---- volk/include/volk/volk_8s_s32f_convert_32f_a16.h | 105 ----- volk/include/volk/volk_8s_s32f_convert_32f_ua16.h | 94 ----- .../volk_8sc_8sc_multiply_conjugate_16sc_a16.h | 102 ----- ...volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h | 122 ------ .../volk/volk_8sc_deinterleave_16s_16s_a16.h | 77 ---- .../volk/volk_8sc_deinterleave_real_16s_a16.h | 66 --- .../volk/volk_8sc_deinterleave_real_8s_a16.h | 67 --- .../volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h | 164 -------- .../volk/volk_8sc_s32f_deinterleave_real_32f_a16.h | 133 ------ volk/include/volk/volk_register.py | 4 +- 131 files changed, 7753 insertions(+), 7753 deletions(-) create mode 100644 volk/include/volk/volk_16i_branch_4_state_8_a16.h create mode 100644 volk/include/volk/volk_16i_convert_8i_a16.h create mode 100644 volk/include/volk/volk_16i_convert_8i_u.h create mode 100644 volk/include/volk/volk_16i_max_star_16i_a16.h create mode 100644 volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h create mode 100644 volk/include/volk/volk_16i_permute_and_scalar_add_a16.h create mode 100644 volk/include/volk/volk_16i_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_16i_s32f_convert_32f_u.h create mode 100644 volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h create mode 100644 volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h create mode 100644 volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h create mode 100644 volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h create mode 100644 volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h create mode 100644 volk/include/volk/volk_16ic_magnitude_16i_a16.h create mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h create mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h create mode 100644 volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h delete mode 100644 volk/include/volk/volk_16s_add_quad_a16.h delete mode 100644 volk/include/volk/volk_16s_branch_4_state_8_a16.h delete mode 100644 volk/include/volk/volk_16s_convert_8s_a16.h delete mode 100644 volk/include/volk/volk_16s_convert_8s_ua16.h delete mode 100644 volk/include/volk/volk_16s_max_star_16s_a16.h delete mode 100644 volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h delete mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_a16.h delete mode 100644 volk/include/volk/volk_16s_quad_max_star_16s_a16.h delete mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_a16.h delete mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_ua16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h delete mode 100644 volk/include/volk/volk_16sc_magnitude_16s_a16.h delete mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h delete mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h delete mode 100644 volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_add_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_divide_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h delete mode 100644 volk/include/volk/volk_32f_32f_interleave_32fc_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_max_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_min_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_multiply_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_subtract_32f_a16.h create mode 100644 volk/include/volk/volk_32f_convert_64f_u.h delete mode 100644 volk/include/volk/volk_32f_convert_64f_ua16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_16i_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_16i_u.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_a16.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_ua16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_32i_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_32i_u.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_a16.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_ua16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_8i_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_8i_u.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_a16.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_ua16.h delete mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h create mode 100644 volk/include/volk/volk_32f_x2_add_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_divide_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_dot_prod_32f_u.h create mode 100644 volk/include/volk/volk_32f_x2_interleave_32fc_a16.h create mode 100644 volk/include/volk/volk_32f_x2_max_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_min_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_multiply_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h create mode 100644 volk/include/volk/volk_32f_x2_subtract_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h delete mode 100644 volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h delete mode 100644 volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h delete mode 100644 volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h delete mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h delete mode 100644 volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h create mode 100644 volk/include/volk/volk_32i_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_32i_s32f_convert_32f_u.h create mode 100644 volk/include/volk/volk_32i_x2_and_32i_a16.h create mode 100644 volk/include/volk/volk_32i_x2_or_32i_a16.h delete mode 100644 volk/include/volk/volk_32s_32s_and_32s_a16.h delete mode 100644 volk/include/volk/volk_32s_32s_or_32s_a16.h delete mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_a16.h delete mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_ua16.h delete mode 100644 volk/include/volk/volk_64f_64f_max_64f_a16.h delete mode 100644 volk/include/volk/volk_64f_64f_min_64f_a16.h create mode 100644 volk/include/volk/volk_64f_convert_32f_u.h delete mode 100644 volk/include/volk/volk_64f_convert_32f_ua16.h create mode 100644 volk/include/volk/volk_64f_x2_max_64f_a16.h create mode 100644 volk/include/volk/volk_64f_x2_min_64f_a16.h create mode 100644 volk/include/volk/volk_8i_convert_16i_a16.h create mode 100644 volk/include/volk/volk_8i_convert_16i_u.h create mode 100644 volk/include/volk/volk_8i_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_8i_s32f_convert_32f_u.h create mode 100644 volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h create mode 100644 volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h create mode 100644 volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h create mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h create mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h create mode 100644 volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h create mode 100644 volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h delete mode 100644 volk/include/volk/volk_8s_convert_16s_a16.h delete mode 100644 volk/include/volk/volk_8s_convert_16s_ua16.h delete mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_a16.h delete mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_ua16.h delete mode 100644 volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h delete mode 100644 volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h delete mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h delete mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index aef1d7ba8..43c8ae9df 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -41,93 +41,93 @@ volkinclude_HEADERS = \ volk.h \ volk_cpu.h \ volk_environment_init.h \ - volk_16s_add_quad_a16.h \ - volk_16s_branch_4_state_8_a16.h \ - volk_16sc_deinterleave_16s_16s_a16.h \ - volk_16sc_s32f_deinterleave_32f_32f_a16.h \ - volk_16sc_deinterleave_real_16s_a16.h \ - volk_16sc_s32f_deinterleave_real_32f_a16.h \ - volk_16sc_deinterleave_real_8s_a16.h \ - volk_16sc_magnitude_16s_a16.h \ - volk_16sc_s32f_magnitude_32f_a16.h \ - volk_16s_s32f_convert_32f_a16.h \ - volk_16s_s32f_convert_32f_ua16.h \ - volk_16s_convert_8s_a16.h \ - volk_16s_convert_8s_ua16.h \ - volk_16s_max_star_16s_a16.h \ - volk_16s_max_star_horizontal_16s_a16.h \ - volk_16s_permute_and_scalar_add_a16.h \ - volk_16s_quad_max_star_16s_a16.h \ + volk_16i_x5_add_quad_16i_x4_a16.h \ + volk_16i_branch_4_state_8_a16.h \ + volk_16ic_deinterleave_16i_x2_a16.h \ + volk_16ic_s32f_deinterleave_32f_x2_a16.h \ + volk_16ic_deinterleave_real_16i_a16.h \ + volk_16ic_s32f_deinterleave_real_32f_a16.h \ + volk_16ic_deinterleave_real_8i_a16.h \ + volk_16ic_magnitude_16i_a16.h \ + volk_16ic_s32f_magnitude_32f_a16.h \ + volk_16i_s32f_convert_32f_a16.h \ + volk_16i_s32f_convert_32f_u.h \ + volk_16i_convert_8i_a16.h \ + volk_16i_convert_8i_u.h \ + volk_16i_max_star_16i_a16.h \ + volk_16i_max_star_horizontal_16i_a16.h \ + volk_16i_permute_and_scalar_add_a16.h \ + volk_16i_x4_quad_max_star_16i_a16.h \ volk_16u_byteswap_a16.h \ volk_32f_accumulator_s32f_a16.h \ - volk_32f_32f_add_32f_a16.h \ + volk_32f_x2_add_32f_a16.h \ volk_32fc_32f_multiply_32fc_a16.h \ volk_32fc_32f_power_32fc_a16.h \ volk_32f_calc_spectral_noise_floor_a16.h \ volk_32fc_s32f_atan2_32f_a16.h \ - volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h \ - volk_32fc_deinterleave_32f_32f_a16.h \ - volk_32fc_deinterleave_64f_64f_a16.h \ - volk_32fc_deinterleave_real_16s_a16.h \ + volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \ + volk_32fc_deinterleave_32f_x2_a16.h \ + volk_32fc_deinterleave_64f_x2_a16.h \ + volk_32fc_deinterleave_real_16i_a16.h \ volk_32fc_deinterleave_real_32f_a16.h \ volk_32fc_deinterleave_real_64f_a16.h \ - volk_32fc_32fc_dot_prod_32fc_a16.h \ + volk_32fc_x2_dot_prod_32fc_a16.h \ volk_32fc_index_max_16u_a16.h \ - volk_32fc_s32f_magnitude_16s_a16.h \ + volk_32fc_s32f_magnitude_16i_a16.h \ volk_32fc_magnitude_32f_a16.h \ - volk_32fc_32fc_multiply_32fc_a16.h \ - volk_32f_s32f_convert_16s_a16.h \ - volk_32f_s32f_convert_16s_ua16.h \ - volk_32f_s32f_convert_32s_a16.h \ - volk_32f_s32f_convert_32s_ua16.h \ + volk_32fc_x2_multiply_32fc_a16.h \ + volk_32f_s32f_convert_16i_a16.h \ + volk_32f_s32f_convert_16i_u.h \ + volk_32f_s32f_convert_32i_a16.h \ + volk_32f_s32f_convert_32i_u.h \ volk_32f_convert_64f_a16.h \ - volk_32f_convert_64f_ua16.h \ - volk_32f_s32f_convert_8s_a16.h \ - volk_32f_s32f_convert_8s_ua16.h \ - volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h \ + volk_32f_convert_64f_u.h \ + volk_32f_s32f_convert_8i_a16.h \ + volk_32f_s32f_convert_8i_u.h \ + volk_32fc_s32f_x2_power_spectral_density_32f_a16.h \ volk_32fc_s32f_power_spectrum_32f_a16.h \ - volk_32fc_32fc_square_dist_32f_a16.h \ - volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h \ - volk_32f_32f_divide_32f_a16.h \ - volk_32f_32f_dot_prod_32f_a16.h \ - volk_32f_32f_dot_prod_32f_ua16.h \ + volk_32fc_x2_square_dist_32f_a16.h \ + volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h \ + volk_32f_x2_divide_32f_a16.h \ + volk_32f_x2_dot_prod_32f_a16.h \ + volk_32f_x2_dot_prod_32f_u.h \ volk_32f_s32f_32f_fm_detect_32f_a16.h \ volk_32f_index_max_16u_a16.h \ - volk_32f_32f_s32f_interleave_16sc_a16.h \ - volk_32f_32f_interleave_32fc_a16.h \ - volk_32f_32f_max_32f_a16.h \ - volk_32f_32f_min_32f_a16.h \ - volk_32f_32f_multiply_32f_a16.h \ + volk_32f_x2_s32f_interleave_16ic_a16.h \ + volk_32f_x2_interleave_32fc_a16.h \ + volk_32f_x2_max_32f_a16.h \ + volk_32f_x2_min_32f_a16.h \ + volk_32f_x2_multiply_32f_a16.h \ volk_32f_s32f_normalize_a16.h \ volk_32f_s32f_power_32f_a16.h \ volk_32f_sqrt_32f_a16.h \ volk_32f_s32f_stddev_32f_a16.h \ - volk_32f_stddev_and_mean_32f_32f_a16.h \ - volk_32f_32f_subtract_32f_a16.h \ - volk_32f_32f_32f_sum_of_poly_32f_a16.h \ - volk_32s_32s_and_32s_a16.h \ - volk_32s_s32f_convert_32f_a16.h \ - volk_32s_s32f_convert_32f_ua16.h \ - volk_32s_32s_or_32s_a16.h \ + volk_32f_stddev_and_mean_32f_x2_a16.h \ + volk_32f_x2_subtract_32f_a16.h \ + volk_32f_x3_sum_of_poly_32f_a16.h \ + volk_32i_x2_and_32i_a16.h \ + volk_32i_s32f_convert_32f_a16.h \ + volk_32i_s32f_convert_32f_u.h \ + volk_32i_x2_or_32i_a16.h \ volk_32u_byteswap_a16.h \ volk_32u_popcnt_a16.h \ volk_64f_convert_32f_a16.h \ - volk_64f_convert_32f_ua16.h \ - volk_64f_64f_max_64f_a16.h \ - volk_64f_64f_min_64f_a16.h \ + volk_64f_convert_32f_u.h \ + volk_64f_x2_max_64f_a16.h \ + volk_64f_x2_min_64f_a16.h \ volk_64u_byteswap_a16.h \ volk_64u_popcnt_a16.h \ - volk_8sc_deinterleave_16s_16s_a16.h \ - volk_8sc_s32f_deinterleave_32f_32f_a16.h \ - volk_8sc_deinterleave_real_16s_a16.h \ - volk_8sc_s32f_deinterleave_real_32f_a16.h \ - volk_8sc_deinterleave_real_8s_a16.h \ - volk_8sc_8sc_multiply_conjugate_16sc_a16.h \ - volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h \ - volk_8s_convert_16s_a16.h \ - volk_8s_convert_16s_ua16.h \ - volk_8s_s32f_convert_32f_a16.h \ - volk_8s_s32f_convert_32f_ua16.h + volk_8ic_deinterleave_16i_x2_a16.h \ + volk_8ic_s32f_deinterleave_32f_x2_a16.h \ + volk_8ic_deinterleave_real_16i_a16.h \ + volk_8ic_s32f_deinterleave_real_32f_a16.h \ + volk_8ic_deinterleave_real_8i_a16.h \ + volk_8ic_x2_multiply_conjugate_16ic_a16.h \ + volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h \ + volk_8i_convert_16i_a16.h \ + volk_8i_convert_16i_u.h \ + volk_8i_s32f_convert_32f_a16.h \ + volk_8i_s32f_convert_32f_u.h VOLK_MKTABLES_SOURCES = \ $(top_srcdir)/lib/volk_rank_archs.c \ diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index f708ba7d0..6e75067d0 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -24,7 +24,7 @@ def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : tempstring = tempstring + " }\n" tempstring = tempstring + " return 0;\n" tempstring = tempstring + "}\n" - + for i in range(len(funclist)): tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n"; diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a16.h b/volk/include/volk/volk_16i_branch_4_state_8_a16.h new file mode 100644 index 000000000..3437c1a6b --- /dev/null +++ b/volk/include/volk/volk_16i_branch_4_state_8_a16.h @@ -0,0 +1,194 @@ +#ifndef INCLUDED_volk_16i_branch_4_state_8_a16_H +#define INCLUDED_volk_16i_branch_4_state_8_a16_H + + +#include +#include + + + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16i_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + + __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; + + + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = 1; + + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + xmm0 = _mm_load_si128((__m128i*)permuters[0]); + xmm6 = _mm_load_si128((__m128i*)permuters[1]); + xmm8 = _mm_load_si128((__m128i*)permuters[2]); + xmm10 = _mm_load_si128((__m128i*)permuters[3]); + + for(; i < bound; ++i) { + + xmm5 = _mm_load_si128(p_src0); + + + + + + + + + + xmm0 = _mm_shuffle_epi8(xmm5, xmm0); + xmm6 = _mm_shuffle_epi8(xmm5, xmm6); + xmm8 = _mm_shuffle_epi8(xmm5, xmm8); + xmm10 = _mm_shuffle_epi8(xmm5, xmm10); + + p_src0 += 4; + + + xmm5 = _mm_add_epi16(xmm1, xmm2); + + xmm6 = _mm_add_epi16(xmm2, xmm6); + xmm8 = _mm_add_epi16(xmm1, xmm8); + + + xmm7 = _mm_load_si128(p_cntl2); + xmm9 = _mm_load_si128(p_cntl3); + + xmm0 = _mm_add_epi16(xmm5, xmm0); + + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm5 = _mm_load_si128(&p_cntl2[1]); + xmm11 = _mm_load_si128(&p_cntl3[1]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm7); + + + + xmm7 = _mm_load_si128(&p_cntl2[2]); + xmm9 = _mm_load_si128(&p_cntl3[2]); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm6 = _mm_add_epi16(xmm6, xmm5); + + + xmm5 = _mm_load_si128(&p_cntl2[3]); + xmm11 = _mm_load_si128(&p_cntl3[3]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm8 = _mm_add_epi16(xmm8, xmm7); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + _mm_store_si128(p_target, xmm0); + _mm_store_si128(&p_target[1], xmm6); + + xmm10 = _mm_add_epi16(xmm5, xmm10); + + _mm_store_si128(&p_target[2], xmm8); + + _mm_store_si128(&p_target[3], xmm10); + + p_target += 3; + } +} + + +#endif /*LV_HAVE_SSEs*/ + +#if LV_HAVE_GENERIC +static inline void volk_16i_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + int i = 0; + + int bound = 4; + + for(; i < bound; ++i) { + target[i* 8] = src0[((char)permuters[i][0])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8] & scalars[2]) + + (cntl3[i * 8] & scalars[3]); + target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 1] & scalars[2]) + + (cntl3[i * 8 + 1] & scalars[3]); + target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 2] & scalars[2]) + + (cntl3[i * 8 + 2] & scalars[3]); + target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 3] & scalars[2]) + + (cntl3[i * 8 + 3] & scalars[3]); + target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 4] & scalars[2]) + + (cntl3[i * 8 + 4] & scalars[3]); + target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 5] & scalars[2]) + + (cntl3[i * 8 + 5] & scalars[3]); + target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 6] & scalars[2]) + + (cntl3[i * 8 + 6] & scalars[3]); + target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 7] & scalars[2]) + + (cntl3[i * 8 + 7] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16i_branch_4_state_8_a16_H*/ diff --git a/volk/include/volk/volk_16i_convert_8i_a16.h b/volk/include/volk/volk_16i_convert_8i_a16.h new file mode 100644 index 000000000..73e45ad63 --- /dev/null +++ b/volk/include/volk/volk_16i_convert_8i_a16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_volk_16i_convert_8i_a16_H +#define INCLUDED_volk_16i_convert_8i_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_convert_8i_a16_H */ diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/include/volk/volk_16i_convert_8i_u.h new file mode 100644 index 000000000..5fc792b56 --- /dev/null +++ b/volk/include/volk/volk_16i_convert_8i_u.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_16i_convert_8i_u_H +#define INCLUDED_volk_16i_convert_8i_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned +*/ +static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned +*/ +static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_convert_8i_u_H */ diff --git a/volk/include/volk/volk_16i_max_star_16i_a16.h b/volk/include/volk/volk_16i_max_star_16i_a16.h new file mode 100644 index 000000000..ff57bd2a1 --- /dev/null +++ b/volk/include/volk/volk_16i_max_star_16i_a16.h @@ -0,0 +1,108 @@ +#ifndef INCLUDED_volk_16i_max_star_16i_a16_H +#define INCLUDED_volk_16i_max_star_16i_a16_H + + +#include +#include + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, unsigned int num_bytes) { + + + + short candidate = src0[0]; + short cands[8]; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + + + __m128i *p_src0; + + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + xmm1 = _mm_setzero_si128(); + xmm0 = _mm_setzero_si128(); + //_mm_insert_epi16(xmm0, candidate, 0); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm1); + + + for(i = 0; i < bound; ++i) { + xmm1 = _mm_load_si128(p_src0); + p_src0 += 1; + xmm2 = _mm_sub_epi16(xmm1, xmm0); + + + + + + + xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); + xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); + xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); + + xmm6 = _mm_xor_si128(xmm4, xmm5); + + xmm3 = _mm_and_si128(xmm3, xmm0); + xmm4 = _mm_and_si128(xmm6, xmm1); + + xmm0 = _mm_add_epi16(xmm3, xmm4); + + + } + + _mm_store_si128((__m128i*)cands, xmm0); + + for(i = 0; i < 8; ++i) { + candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; + } + + + + for(i = 0; i < leftovers; ++i) { + + candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; + } + + target[0] = candidate; + + + + + +} + +#endif /*LV_HAVE_SSSE3*/ + +#if LV_HAVE_GENERIC + +static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short candidate = src0[0]; + for(i = 1; i < bound; ++i) { + candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; + } + target[0] = candidate; + +} + + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16i_max_star_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h new file mode 100644 index 000000000..695e08dbf --- /dev/null +++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h @@ -0,0 +1,130 @@ +#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a16_H +#define INCLUDED_volk_16i_max_star_horizontal_16i_a16_H + + +#include +#include + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; + const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; + + + + volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i xmm5, xmm6, xmm7, xmm8; + + xmm4 = _mm_load_si128((__m128i*)shufmask0); + xmm5 = _mm_load_si128((__m128i*)shufmask1); + xmm6 = _mm_load_si128((__m128i*)andmask0); + xmm7 = _mm_load_si128((__m128i*)andmask1); + + __m128i *p_target, *p_src0; + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 5; + int intermediate = (num_bytes >> 4) & 1; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + for(i = 0; i < bound; ++i) { + + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(&p_src0[1]); + + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 2; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + xmm3 = _mm_and_si128(xmm2, xmm7); + + + xmm8 = _mm_add_epi8(xmm8, xmm4); + xmm3 = _mm_add_epi8(xmm3, xmm5); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm8); + xmm1 = _mm_shuffle_epi8(xmm1, xmm3); + + + xmm3 = _mm_add_epi16(xmm0, xmm1); + + + _mm_store_si128(p_target, xmm3); + + p_target += 1; + + } + + for(i = 0; i < intermediate; ++i) { + + xmm0 = _mm_load_si128(p_src0); + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 1; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + + xmm3 = _mm_add_epi8(xmm8, xmm4); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm3); + + + _mm_storel_pd((double*)p_target, (__m128d)xmm0); + + p_target = (__m128i*)((int8_t*)p_target + 8); + + } + + for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { + target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; + } + + +} + +#endif /*LV_HAVE_SSSE3*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + + for(i = 0; i < bound; i += 2) { + target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; + } + +} + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h new file mode 100644 index 000000000..e52a949fb --- /dev/null +++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h @@ -0,0 +1,139 @@ +#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a16_H +#define INCLUDED_volk_16i_permute_and_scalar_add_a16_H + + +#include +#include + + + + +#if LV_HAVE_SSE2 + +#include +#include + +static inline void volk_16i_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; + + short* p_permute_indexes = permute_indexes; + + p_target = (__m128i*)target; + p_cntl0 = (__m128i*)cntl0; + p_cntl1 = (__m128i*)cntl1; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + + for(; i < bound; ++i) { + xmm0 = _mm_setzero_si128(); + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = _mm_setzero_si128(); + + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_permute_indexes += 8; + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + xmm5 = _mm_load_si128(p_cntl0); + xmm6 = _mm_load_si128(p_cntl1); + xmm7 = _mm_load_si128(p_cntl2); + + xmm5 = _mm_and_si128(xmm5, xmm1); + xmm6 = _mm_and_si128(xmm6, xmm2); + xmm7 = _mm_and_si128(xmm7, xmm3); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + xmm5 = _mm_load_si128(p_cntl3); + + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_cntl0 += 1; + + xmm5 = _mm_and_si128(xmm5, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + p_cntl1 += 1; + p_cntl2 += 1; + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + p_cntl3 += 1; + + _mm_store_si128(p_target, xmm0); + + p_target += 1; + } + + + + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + } +} +#endif /*LV_HAVE_SSEs*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a16_H*/ diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a16.h b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h new file mode 100644 index 000000000..83fd26ff9 --- /dev/null +++ b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h @@ -0,0 +1,119 @@ +#ifndef INCLUDED_volk_16i_s32f_convert_32f_a16_H +#define INCLUDED_volk_16i_s32f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/include/volk/volk_16i_s32f_convert_32f_u.h new file mode 100644 index 000000000..8f0dd0083 --- /dev/null +++ b/volk/include/volk/volk_16i_s32f_convert_32f_u.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H +#define INCLUDED_volk_16i_s32f_convert_32f_u_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */ diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h new file mode 100644 index 000000000..e4ec5ab4e --- /dev/null +++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h @@ -0,0 +1,191 @@ +#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H +#define INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H + + +#include +#include + + + + + +#if LV_HAVE_SSE2 + +#include + +static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + + + + int i = 0; + + int bound = (num_bytes >> 4); + int bound_copy = bound; + int leftovers = (num_bytes >> 1) & 7; + + __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; + p_target = (__m128i*) target; + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + + + + __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + while(bound_copy > 0) { + + xmm1 = _mm_load_si128(p_src0); + xmm2 = _mm_load_si128(p_src1); + xmm3 = _mm_load_si128(p_src2); + xmm4 = _mm_load_si128(p_src3); + + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = xmm1; + xmm8 = xmm3; + + + xmm1 = _mm_sub_epi16(xmm2, xmm1); + + + + xmm3 = _mm_sub_epi16(xmm4, xmm3); + + xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); + xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); + + + + xmm2 = _mm_and_si128(xmm5, xmm2); + xmm4 = _mm_and_si128(xmm6, xmm4); + xmm5 = _mm_andnot_si128(xmm5, xmm7); + xmm6 = _mm_andnot_si128(xmm6, xmm8); + + xmm5 = _mm_add_epi16(xmm2, xmm5); + xmm6 = _mm_add_epi16(xmm4, xmm6); + + + xmm1 = _mm_xor_si128(xmm1, xmm1); + xmm2 = xmm5; + xmm5 = _mm_sub_epi16(xmm6, xmm5); + p_src0 += 1; + bound_copy -= 1; + + xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); + p_src1 += 1; + + xmm6 = _mm_and_si128(xmm1, xmm6); + + xmm1 = _mm_andnot_si128(xmm1, xmm2); + p_src2 += 1; + + + + xmm1 = _mm_add_epi16(xmm6, xmm1); + p_src3 += 1; + + + _mm_store_si128(p_target, xmm1); + p_target += 1; + + } + + + /*asm volatile + ( + "volk_16i_x4_quad_max_star_16i_a16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je volk_16i_x4_quad_max_star_16i_a16_sse2_END\n\t" + + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "movaps %%xmm1, %%xmm7\n\t" + "movaps %%xmm3, %%xmm8\n\t" + "psubw %%xmm2, %%xmm1\n\t" + "psubw %%xmm4, %%xmm3\n\t" + + "pcmpgtw %%xmm1, %%xmm5\n\t" + "pcmpgtw %%xmm3, %%xmm6\n\t" + + "pand %%xmm5, %%xmm2\n\t" + "pand %%xmm6, %%xmm4\n\t" + "pandn %%xmm7, %%xmm5\n\t" + "pandn %%xmm8, %%xmm6\n\t" + + "paddw %%xmm2, %%xmm5\n\t" + "paddw %%xmm4, %%xmm6\n\t" + + "pxor %%xmm1, %%xmm1\n\t" + "movaps %%xmm5, %%xmm2\n\t" + + "psubw %%xmm6, %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $-1, %[bound]\n\t" + + "pcmpgtw %%xmm5, %%xmm1\n\t" + "add $16, %[src1]\n\t" + + "pand %%xmm1, %%xmm6\n\t" + + "pandn %%xmm2, %%xmm1\n\t" + "add $16, %[src2]\n\t" + + "paddw %%xmm6, %%xmm1\n\t" + "add $16, %[src3]\n\t" + + "movaps %%xmm1, (%[target])\n\t" + "addw $16, %[target]\n\t" + "jmp volk_16i_x4_quad_max_star_16i_a16_sse2_L1\n\t" + + "volk_16i_x4_quad_max_star_16i_a16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) + : + ); + */ + + short temp0 = 0; + short temp1 = 0; + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } + return; + + +} + +#endif /*LV_HAVE_SSE2*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short temp0 = 0; + short temp1 = 0; + for(i = 0; i < bound; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } +} + + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h new file mode 100644 index 000000000..5744ca3a6 --- /dev/null +++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h @@ -0,0 +1,136 @@ +#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H +#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H + + +#include +#include + + + + + +#if LV_HAVE_SSE2 +#include +#include + +static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; + p_target0 = (__m128i*)target0; + p_target1 = (__m128i*)target1; + p_target2 = (__m128i*)target2; + p_target3 = (__m128i*)target3; + + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + p_src4 = (__m128i*)src4; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + for(; i < bound; ++i) { + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(p_src1); + xmm2 = _mm_load_si128(p_src2); + xmm3 = _mm_load_si128(p_src3); + xmm4 = _mm_load_si128(p_src4); + + p_src0 += 1; + p_src1 += 1; + + xmm1 = _mm_add_epi16(xmm0, xmm1); + xmm2 = _mm_add_epi16(xmm0, xmm2); + xmm3 = _mm_add_epi16(xmm0, xmm3); + xmm4 = _mm_add_epi16(xmm0, xmm4); + + + p_src2 += 1; + p_src3 += 1; + p_src4 += 1; + + _mm_store_si128(p_target0, xmm1); + _mm_store_si128(p_target1, xmm2); + _mm_store_si128(p_target2, xmm3); + _mm_store_si128(p_target3, xmm4); + + p_target0 += 1; + p_target1 += 1; + p_target2 += 1; + p_target3 += 1; + } + /*asm volatile + ( + ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END\n\t" + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + "movaps (%[src4]), %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $16, %[src1]\n\t" + "add $16, %[src2]\n\t" + "add $16, %[src3]\n\t" + "add $16, %[src4]\n\t" + "paddw %%xmm1, %%xmm2\n\t" + "paddw %%xmm1, %%xmm3\n\t" + "paddw %%xmm1, %%xmm4\n\t" + "paddw %%xmm1, %%xmm5\n\t" + "add $-1, %[bound]\n\t" + "movaps %%xmm2, (%[target0])\n\t" + "movaps %%xmm3, (%[target1])\n\t" + "movaps %%xmm4, (%[target2])\n\t" + "movaps %%xmm5, (%[target3])\n\t" + "add $16, %[target0]\n\t" + "add $16, %[target1]\n\t" + "add $16, %[target2]\n\t" + "add $16, %[target3]\n\t" + "jmp .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1\n\t" + ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) + :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + + */ + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} +#endif /*LV_HAVE_SSE2*/ + + +#if LV_HAVE_GENERIC + +static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + + + + + +#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H*/ diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h new file mode 100644 index 000000000..7e08bf182 --- /dev/null +++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H +#define INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); + __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); + qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = *int16ComplexVectorPtr++; + *qBufferPtr++ = *int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); + + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16ic_deinterleave_16i_x2_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16ic_deinterleave_16i_x2_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_deinterleave_16i_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h new file mode 100644 index 000000000..388c00592 --- /dev/null +++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a16_H +#define INCLUDED_volk_16ic_deinterleave_real_16i_a16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + iOutputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i complexVal1, complexVal2, iOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h new file mode 100644 index 000000000..437d5ab6b --- /dev/null +++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a16_H +#define INCLUDED_volk_16ic_deinterleave_real_8i_a16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + complexVal1 = _mm_or_si128(complexVal1, complexVal2); + + complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); + complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); + + complexVal3 = _mm_or_si128(complexVal3, complexVal4); + + + complexVal1 = _mm_srai_epi16(complexVal1, 8); + complexVal3 = _mm_srai_epi16(complexVal3, 8); + + iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); + int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16ic_deinterleave_real_8i_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16ic_deinterleave_real_8i_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_deinterleave_real_8i_a16_orc_impl(iBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a16_H */ diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a16.h b/volk/include/volk/volk_16ic_magnitude_16i_a16.h new file mode 100644 index 000000000..bdcace750 --- /dev/null +++ b/volk/include/volk/volk_16ic_magnitude_16i_a16.h @@ -0,0 +1,190 @@ +#ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H +#define INCLUDED_volk_16ic_magnitude_16i_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + float inputFloatBuffer[4] __attribute__((aligned(128))); + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue1 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue2 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float scalar = 32768.0; + for(number = 0; number < num_points; number++){ + float real = ((float)(*complexVectorPtr++)) / scalar; + float imag = ((float)(*complexVectorPtr++)) / scalar; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC_DISABLED +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16ic_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); +static inline void volk_16ic_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_magnitude_16i_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h new file mode 100644 index 000000000..606de2fc5 --- /dev/null +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h @@ -0,0 +1,108 @@ +#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H +#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + uint64_t number = 0; + const uint64_t quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +extern void volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h new file mode 100644 index 000000000..62331e496 --- /dev/null +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h @@ -0,0 +1,125 @@ +#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H +#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + + for(;number < quarterPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi16_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; + sixteenTComplexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar = 1.0/scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h new file mode 100644 index 000000000..ae64efbeb --- /dev/null +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h @@ -0,0 +1,179 @@ +#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H +#define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) / scalar; + float val1Imag = (float)(*complexVectorPtr++) / scalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + + __m128 cplxValue1, cplxValue2, result, re, im; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); + im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(re, invScalar); + cplxValue2 = _mm_mul_ps(im, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) * iScalar; + float val1Imag = (float)(*complexVectorPtr++) * iScalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} + + +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + float real = ( (float) (*complexVectorPtr++)) * invScalar; + float imag = ( (float) (*complexVectorPtr++)) * invScalar; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC_DISABLED +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16ic_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16ic_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16ic_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_16s_add_quad_a16.h b/volk/include/volk/volk_16s_add_quad_a16.h deleted file mode 100644 index 67d0c55a3..000000000 --- a/volk/include/volk/volk_16s_add_quad_a16.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef INCLUDED_volk_16s_add_quad_a16_H -#define INCLUDED_volk_16s_add_quad_a16_H - - -#include -#include - - - - - -#if LV_HAVE_SSE2 -#include -#include - -static inline void volk_16s_add_quad_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; - p_target0 = (__m128i*)target0; - p_target1 = (__m128i*)target1; - p_target2 = (__m128i*)target2; - p_target3 = (__m128i*)target3; - - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - p_src4 = (__m128i*)src4; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - for(; i < bound; ++i) { - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(p_src1); - xmm2 = _mm_load_si128(p_src2); - xmm3 = _mm_load_si128(p_src3); - xmm4 = _mm_load_si128(p_src4); - - p_src0 += 1; - p_src1 += 1; - - xmm1 = _mm_add_epi16(xmm0, xmm1); - xmm2 = _mm_add_epi16(xmm0, xmm2); - xmm3 = _mm_add_epi16(xmm0, xmm3); - xmm4 = _mm_add_epi16(xmm0, xmm4); - - - p_src2 += 1; - p_src3 += 1; - p_src4 += 1; - - _mm_store_si128(p_target0, xmm1); - _mm_store_si128(p_target1, xmm2); - _mm_store_si128(p_target2, xmm3); - _mm_store_si128(p_target3, xmm4); - - p_target0 += 1; - p_target1 += 1; - p_target2 += 1; - p_target3 += 1; - } - /*asm volatile - ( - ".%=volk_16s_add_quad_a16_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je .%=volk_16s_add_quad_a16_sse2_END\n\t" - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - "movaps (%[src4]), %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $16, %[src1]\n\t" - "add $16, %[src2]\n\t" - "add $16, %[src3]\n\t" - "add $16, %[src4]\n\t" - "paddw %%xmm1, %%xmm2\n\t" - "paddw %%xmm1, %%xmm3\n\t" - "paddw %%xmm1, %%xmm4\n\t" - "paddw %%xmm1, %%xmm5\n\t" - "add $-1, %[bound]\n\t" - "movaps %%xmm2, (%[target0])\n\t" - "movaps %%xmm3, (%[target1])\n\t" - "movaps %%xmm4, (%[target2])\n\t" - "movaps %%xmm5, (%[target3])\n\t" - "add $16, %[target0]\n\t" - "add $16, %[target1]\n\t" - "add $16, %[target2]\n\t" - "add $16, %[target3]\n\t" - "jmp .%=volk_16s_add_quad_a16_sse2_L1\n\t" - ".%=volk_16s_add_quad_a16_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) - :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - - */ - - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } -} -#endif /*LV_HAVE_SSE2*/ - - -#if LV_HAVE_GENERIC - -static inline void volk_16s_add_quad_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - for(i = 0; i < bound; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } -} - -#endif /* LV_HAVE_GENERIC */ - - - - - -#endif /*INCLUDED_volk_16s_add_quad_a16_H*/ diff --git a/volk/include/volk/volk_16s_branch_4_state_8_a16.h b/volk/include/volk/volk_16s_branch_4_state_8_a16.h deleted file mode 100644 index 4c1af8729..000000000 --- a/volk/include/volk/volk_16s_branch_4_state_8_a16.h +++ /dev/null @@ -1,194 +0,0 @@ -#ifndef INCLUDED_volk_16s_branch_4_state_8_a16_H -#define INCLUDED_volk_16s_branch_4_state_8_a16_H - - -#include -#include - - - - -#if LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16s_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; - - __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; - - - - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; - - int i = 0; - - int bound = 1; - - - xmm0 = _mm_load_si128(p_scalars); - - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - - xmm0 = _mm_load_si128((__m128i*)permuters[0]); - xmm6 = _mm_load_si128((__m128i*)permuters[1]); - xmm8 = _mm_load_si128((__m128i*)permuters[2]); - xmm10 = _mm_load_si128((__m128i*)permuters[3]); - - for(; i < bound; ++i) { - - xmm5 = _mm_load_si128(p_src0); - - - - - - - - - - xmm0 = _mm_shuffle_epi8(xmm5, xmm0); - xmm6 = _mm_shuffle_epi8(xmm5, xmm6); - xmm8 = _mm_shuffle_epi8(xmm5, xmm8); - xmm10 = _mm_shuffle_epi8(xmm5, xmm10); - - p_src0 += 4; - - - xmm5 = _mm_add_epi16(xmm1, xmm2); - - xmm6 = _mm_add_epi16(xmm2, xmm6); - xmm8 = _mm_add_epi16(xmm1, xmm8); - - - xmm7 = _mm_load_si128(p_cntl2); - xmm9 = _mm_load_si128(p_cntl3); - - xmm0 = _mm_add_epi16(xmm5, xmm0); - - - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); - - xmm5 = _mm_load_si128(&p_cntl2[1]); - xmm11 = _mm_load_si128(&p_cntl3[1]); - - xmm7 = _mm_add_epi16(xmm7, xmm9); - - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); - - xmm0 = _mm_add_epi16(xmm0, xmm7); - - - - xmm7 = _mm_load_si128(&p_cntl2[2]); - xmm9 = _mm_load_si128(&p_cntl3[2]); - - xmm5 = _mm_add_epi16(xmm5, xmm11); - - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); - - xmm6 = _mm_add_epi16(xmm6, xmm5); - - - xmm5 = _mm_load_si128(&p_cntl2[3]); - xmm11 = _mm_load_si128(&p_cntl3[3]); - - xmm7 = _mm_add_epi16(xmm7, xmm9); - - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); - - xmm8 = _mm_add_epi16(xmm8, xmm7); - - xmm5 = _mm_add_epi16(xmm5, xmm11); - - _mm_store_si128(p_target, xmm0); - _mm_store_si128(&p_target[1], xmm6); - - xmm10 = _mm_add_epi16(xmm5, xmm10); - - _mm_store_si128(&p_target[2], xmm8); - - _mm_store_si128(&p_target[3], xmm10); - - p_target += 3; - } -} - - -#endif /*LV_HAVE_SSEs*/ - -#if LV_HAVE_GENERIC -static inline void volk_16s_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - int i = 0; - - int bound = 4; - - for(; i < bound; ++i) { - target[i* 8] = src0[((char)permuters[i][0])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8] & scalars[2]) - + (cntl3[i * 8] & scalars[3]); - target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 1] & scalars[2]) - + (cntl3[i * 8 + 1] & scalars[3]); - target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 2] & scalars[2]) - + (cntl3[i * 8 + 2] & scalars[3]); - target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 3] & scalars[2]) - + (cntl3[i * 8 + 3] & scalars[3]); - target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 4] & scalars[2]) - + (cntl3[i * 8 + 4] & scalars[3]); - target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 5] & scalars[2]) - + (cntl3[i * 8 + 5] & scalars[3]); - target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 6] & scalars[2]) - + (cntl3[i * 8 + 6] & scalars[3]); - target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 7] & scalars[2]) - + (cntl3[i * 8 + 7] & scalars[3]); - - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_16s_branch_4_state_8_a16_H*/ diff --git a/volk/include/volk/volk_16s_convert_8s_a16.h b/volk/include/volk/volk_16s_convert_8s_a16.h deleted file mode 100644 index 13db435de..000000000 --- a/volk/include/volk/volk_16s_convert_8s_a16.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef INCLUDED_volk_16s_convert_8s_a16_H -#define INCLUDED_volk_16s_convert_8s_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16s_convert_8s_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - - // Load the 16 values - inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); - - ret = _mm_packs_epi16(inputVal1, inputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, ret); - - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16s_convert_8s_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16s_convert_8s_a16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_ua16.h b/volk/include/volk/volk_16s_convert_8s_ua16.h deleted file mode 100644 index 9941118ae..000000000 --- a/volk/include/volk/volk_16s_convert_8s_ua16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_volk_16s_convert_8s_ua16_H -#define INCLUDED_volk_16s_convert_8s_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned -*/ -static inline void volk_16s_convert_8s_ua16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - - // Load the 16 values - inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; - - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); - - ret = _mm_packs_epi16(inputVal1, inputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, ret); - - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned -*/ -static inline void volk_16s_convert_8s_ua16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16s_convert_8s_ua16_H */ diff --git a/volk/include/volk/volk_16s_max_star_16s_a16.h b/volk/include/volk/volk_16s_max_star_16s_a16.h deleted file mode 100644 index b2ec90552..000000000 --- a/volk/include/volk/volk_16s_max_star_16s_a16.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef INCLUDED_volk_16s_max_star_16s_a16_H -#define INCLUDED_volk_16s_max_star_16s_a16_H - - -#include -#include - - -#if LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16s_max_star_16s_a16_ssse3(short* target, short* src0, unsigned int num_bytes) { - - - - short candidate = src0[0]; - short cands[8]; - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; - - - __m128i *p_src0; - - p_src0 = (__m128i*)src0; - - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 1) & 7; - - int i = 0; - - - xmm1 = _mm_setzero_si128(); - xmm0 = _mm_setzero_si128(); - //_mm_insert_epi16(xmm0, candidate, 0); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm1); - - - for(i = 0; i < bound; ++i) { - xmm1 = _mm_load_si128(p_src0); - p_src0 += 1; - xmm2 = _mm_sub_epi16(xmm1, xmm0); - - - - - - - xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); - xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); - xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); - - xmm6 = _mm_xor_si128(xmm4, xmm5); - - xmm3 = _mm_and_si128(xmm3, xmm0); - xmm4 = _mm_and_si128(xmm6, xmm1); - - xmm0 = _mm_add_epi16(xmm3, xmm4); - - - } - - _mm_store_si128((__m128i*)cands, xmm0); - - for(i = 0; i < 8; ++i) { - candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; - } - - - - for(i = 0; i < leftovers; ++i) { - - candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; - } - - target[0] = candidate; - - - - - -} - -#endif /*LV_HAVE_SSSE3*/ - -#if LV_HAVE_GENERIC - -static inline void volk_16s_max_star_16s_a16_generic(short* target, short* src0, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - short candidate = src0[0]; - for(i = 1; i < bound; ++i) { - candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; - } - target[0] = candidate; - -} - - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_16s_max_star_16s_a16_H*/ diff --git a/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h b/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h deleted file mode 100644 index 68994593b..000000000 --- a/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef INCLUDED_volk_16s_max_star_horizontal_16s_a16_H -#define INCLUDED_volk_16s_max_star_horizontal_16s_a16_H - - -#include -#include - - -#if LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16s_max_star_horizontal_16s_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { - - const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; - const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; - const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; - - - - volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i xmm5, xmm6, xmm7, xmm8; - - xmm4 = _mm_load_si128((__m128i*)shufmask0); - xmm5 = _mm_load_si128((__m128i*)shufmask1); - xmm6 = _mm_load_si128((__m128i*)andmask0); - xmm7 = _mm_load_si128((__m128i*)andmask1); - - __m128i *p_target, *p_src0; - - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - - int bound = num_bytes >> 5; - int intermediate = (num_bytes >> 4) & 1; - int leftovers = (num_bytes >> 1) & 7; - - int i = 0; - - - for(i = 0; i < bound; ++i) { - - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(&p_src0[1]); - - - - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 2; - - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - - xmm8 = _mm_and_si128(xmm2, xmm6); - xmm3 = _mm_and_si128(xmm2, xmm7); - - - xmm8 = _mm_add_epi8(xmm8, xmm4); - xmm3 = _mm_add_epi8(xmm3, xmm5); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm8); - xmm1 = _mm_shuffle_epi8(xmm1, xmm3); - - - xmm3 = _mm_add_epi16(xmm0, xmm1); - - - _mm_store_si128(p_target, xmm3); - - p_target += 1; - - } - - for(i = 0; i < intermediate; ++i) { - - xmm0 = _mm_load_si128(p_src0); - - - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 1; - - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - - xmm8 = _mm_and_si128(xmm2, xmm6); - - xmm3 = _mm_add_epi8(xmm8, xmm4); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm3); - - - _mm_storel_pd((double*)p_target, (__m128d)xmm0); - - p_target = (__m128i*)((int8_t*)p_target + 8); - - } - - for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { - target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; - } - - -} - -#endif /*LV_HAVE_SSSE3*/ - - -#if LV_HAVE_GENERIC -static inline void volk_16s_max_star_horizontal_16s_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - - for(i = 0; i < bound; i += 2) { - target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; - } - -} - - - -#endif /*LV_HAVE_GENERIC*/ - -#endif /*INCLUDED_volk_16s_max_star_horizontal_16s_a16_H*/ diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h deleted file mode 100644 index 2e7586b57..000000000 --- a/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h +++ /dev/null @@ -1,139 +0,0 @@ -#ifndef INCLUDED_volk_16s_permute_and_scalar_add_a16_H -#define INCLUDED_volk_16s_permute_and_scalar_add_a16_H - - -#include -#include - - - - -#if LV_HAVE_SSE2 - -#include -#include - -static inline void volk_16s_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; - - short* p_permute_indexes = permute_indexes; - - p_target = (__m128i*)target; - p_cntl0 = (__m128i*)cntl0; - p_cntl1 = (__m128i*)cntl1; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - xmm0 = _mm_load_si128(p_scalars); - - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - - - for(; i < bound; ++i) { - xmm0 = _mm_setzero_si128(); - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = _mm_setzero_si128(); - - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); - - xmm0 = _mm_add_epi16(xmm0, xmm5); - xmm6 = _mm_add_epi16(xmm6, xmm7); - - p_permute_indexes += 8; - - xmm0 = _mm_add_epi16(xmm0, xmm6); - - xmm5 = _mm_load_si128(p_cntl0); - xmm6 = _mm_load_si128(p_cntl1); - xmm7 = _mm_load_si128(p_cntl2); - - xmm5 = _mm_and_si128(xmm5, xmm1); - xmm6 = _mm_and_si128(xmm6, xmm2); - xmm7 = _mm_and_si128(xmm7, xmm3); - - xmm0 = _mm_add_epi16(xmm0, xmm5); - - xmm5 = _mm_load_si128(p_cntl3); - - xmm6 = _mm_add_epi16(xmm6, xmm7); - - p_cntl0 += 1; - - xmm5 = _mm_and_si128(xmm5, xmm4); - - xmm0 = _mm_add_epi16(xmm0, xmm6); - - p_cntl1 += 1; - p_cntl2 += 1; - - xmm0 = _mm_add_epi16(xmm0, xmm5); - - p_cntl3 += 1; - - _mm_store_si128(p_target, xmm0); - - p_target += 1; - } - - - - - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - } -} -#endif /*LV_HAVE_SSEs*/ - - -#if LV_HAVE_GENERIC -static inline void volk_16s_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - for(i = 0; i < bound; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_16s_permute_and_scalar_add_a16_H*/ diff --git a/volk/include/volk/volk_16s_quad_max_star_16s_a16.h b/volk/include/volk/volk_16s_quad_max_star_16s_a16.h deleted file mode 100644 index 3e89ff963..000000000 --- a/volk/include/volk/volk_16s_quad_max_star_16s_a16.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef INCLUDED_volk_16s_quad_max_star_16s_a16_H -#define INCLUDED_volk_16s_quad_max_star_16s_a16_H - - -#include -#include - - - - - -#if LV_HAVE_SSE2 - -#include - -static inline void volk_16s_quad_max_star_16s_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - - - - int i = 0; - - int bound = (num_bytes >> 4); - int bound_copy = bound; - int leftovers = (num_bytes >> 1) & 7; - - __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; - p_target = (__m128i*) target; - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - - - - __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - - while(bound_copy > 0) { - - xmm1 = _mm_load_si128(p_src0); - xmm2 = _mm_load_si128(p_src1); - xmm3 = _mm_load_si128(p_src2); - xmm4 = _mm_load_si128(p_src3); - - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = xmm1; - xmm8 = xmm3; - - - xmm1 = _mm_sub_epi16(xmm2, xmm1); - - - - xmm3 = _mm_sub_epi16(xmm4, xmm3); - - xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); - xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); - - - - xmm2 = _mm_and_si128(xmm5, xmm2); - xmm4 = _mm_and_si128(xmm6, xmm4); - xmm5 = _mm_andnot_si128(xmm5, xmm7); - xmm6 = _mm_andnot_si128(xmm6, xmm8); - - xmm5 = _mm_add_epi16(xmm2, xmm5); - xmm6 = _mm_add_epi16(xmm4, xmm6); - - - xmm1 = _mm_xor_si128(xmm1, xmm1); - xmm2 = xmm5; - xmm5 = _mm_sub_epi16(xmm6, xmm5); - p_src0 += 1; - bound_copy -= 1; - - xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); - p_src1 += 1; - - xmm6 = _mm_and_si128(xmm1, xmm6); - - xmm1 = _mm_andnot_si128(xmm1, xmm2); - p_src2 += 1; - - - - xmm1 = _mm_add_epi16(xmm6, xmm1); - p_src3 += 1; - - - _mm_store_si128(p_target, xmm1); - p_target += 1; - - } - - - /*asm volatile - ( - "volk_16s_quad_max_star_16s_a16_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je volk_16s_quad_max_star_16s_a16_sse2_END\n\t" - - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "movaps %%xmm1, %%xmm7\n\t" - "movaps %%xmm3, %%xmm8\n\t" - "psubw %%xmm2, %%xmm1\n\t" - "psubw %%xmm4, %%xmm3\n\t" - - "pcmpgtw %%xmm1, %%xmm5\n\t" - "pcmpgtw %%xmm3, %%xmm6\n\t" - - "pand %%xmm5, %%xmm2\n\t" - "pand %%xmm6, %%xmm4\n\t" - "pandn %%xmm7, %%xmm5\n\t" - "pandn %%xmm8, %%xmm6\n\t" - - "paddw %%xmm2, %%xmm5\n\t" - "paddw %%xmm4, %%xmm6\n\t" - - "pxor %%xmm1, %%xmm1\n\t" - "movaps %%xmm5, %%xmm2\n\t" - - "psubw %%xmm6, %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $-1, %[bound]\n\t" - - "pcmpgtw %%xmm5, %%xmm1\n\t" - "add $16, %[src1]\n\t" - - "pand %%xmm1, %%xmm6\n\t" - - "pandn %%xmm2, %%xmm1\n\t" - "add $16, %[src2]\n\t" - - "paddw %%xmm6, %%xmm1\n\t" - "add $16, %[src3]\n\t" - - "movaps %%xmm1, (%[target])\n\t" - "addw $16, %[target]\n\t" - "jmp volk_16s_quad_max_star_16s_a16_sse2_L1\n\t" - - "volk_16s_quad_max_star_16s_a16_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) - : - ); - */ - - short temp0 = 0; - short temp1 = 0; - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } - return; - - -} - -#endif /*LV_HAVE_SSE2*/ - - -#if LV_HAVE_GENERIC -static inline void volk_16s_quad_max_star_16s_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - short temp0 = 0; - short temp1 = 0; - for(i = 0; i < bound; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } -} - - - - -#endif /*LV_HAVE_GENERIC*/ - -#endif /*INCLUDED_volk_16s_quad_max_star_16s_a16_H*/ diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_a16.h b/volk/include/volk/volk_16s_s32f_convert_32f_a16.h deleted file mode 100644 index 8f9b44478..000000000 --- a/volk/include/volk/volk_16s_s32f_convert_32f_a16.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef INCLUDED_volk_16s_s32f_convert_32f_a16_H -#define INCLUDED_volk_16s_s32f_convert_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16s_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16s_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16s_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h deleted file mode 100644 index ad52aea1a..000000000 --- a/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef INCLUDED_volk_16s_s32f_convert_32f_ua16_H -#define INCLUDED_volk_16s_s32f_convert_32f_ua16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_16s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_16s_s32f_convert_32f_ua16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_16s_s32f_convert_32f_ua16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16s_s32f_convert_32f_ua16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h deleted file mode 100644 index 8e5da24ec..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H -#define INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_16s_16s_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); - __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); - qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = *int16ComplexVectorPtr++; - *qBufferPtr++ = *int16ComplexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_16s_16s_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); - - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -extern void volk_16sc_deinterleave_16s_16s_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16sc_deinterleave_16s_16s_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_deinterleave_16s_16s_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h deleted file mode 100644 index 068c1350c..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_volk_16sc_deinterleave_real_16s_a16_H -#define INCLUDED_volk_16sc_deinterleave_real_16s_a16_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_16s_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - - iOutputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_16s_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i complexVal1, complexVal2, iOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); - - iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16sc_deinterleave_real_16s_a16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h deleted file mode 100644 index afa21ebc4..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_volk_16sc_deinterleave_real_8s_a16_H -#define INCLUDED_volk_16sc_deinterleave_real_8s_a16_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - - complexVal1 = _mm_or_si128(complexVal1, complexVal2); - - complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); - complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); - - complexVal3 = _mm_or_si128(complexVal3, complexVal4); - - - complexVal1 = _mm_srai_epi16(complexVal1, 8); - complexVal3 = _mm_srai_epi16(complexVal3, 8); - - iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); - int16ComplexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -extern void volk_16sc_deinterleave_real_8s_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16sc_deinterleave_real_8s_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_deinterleave_real_8s_a16_orc_impl(iBuffer, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16sc_deinterleave_real_8s_a16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_16s_a16.h b/volk/include/volk/volk_16sc_magnitude_16s_a16.h deleted file mode 100644 index d832de5fe..000000000 --- a/volk/include/volk/volk_16sc_magnitude_16s_a16.h +++ /dev/null @@ -1,190 +0,0 @@ -#ifndef INCLUDED_volk_16sc_magnitude_16s_a16_H -#define INCLUDED_volk_16sc_magnitude_16s_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(32768.0); - __m128 invScalar = _mm_set_ps1(1.0/32768.0); - - __m128 cplxValue1, cplxValue2, result; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - result = _mm_mul_ps(result, vScalar); // Scale the results - - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / 32768.0; - const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; - *magnitudeVectorPtr++ = (int16_t)(val1Result); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(32768.0); - __m128 invScalar = _mm_set_ps1(1.0/32768.0); - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - - float inputFloatBuffer[4] __attribute__((aligned(128))); - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - cplxValue1 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - cplxValue2 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - result = _mm_mul_ps(result, vScalar); // Scale the results - - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / 32768.0; - const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; - *magnitudeVectorPtr++ = (int16_t)(val1Result); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float scalar = 32768.0; - for(number = 0; number < num_points; number++){ - float real = ((float)(*complexVectorPtr++)) / scalar; - float imag = ((float)(*complexVectorPtr++)) / scalar; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC_DISABLED -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_16sc_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); -static inline void volk_16sc_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16sc_magnitude_16s_a16_H */ diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h deleted file mode 100644 index 53e4253c4..000000000 --- a/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H -#define INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - uint64_t number = 0; - const uint64_t quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - - float floatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); - - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -extern void volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h deleted file mode 100644 index 7320db368..000000000 --- a/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H -#define INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - - for(;number < quarterPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm_cvtepi16_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; - sixteenTComplexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; - - const float iScalar = 1.0/scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; - - iValue = _mm_load_ps(floatBuffer); - - iValue = _mm_mul_ps(iValue, invScalar); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h deleted file mode 100644 index 649b5cc96..000000000 --- a/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h +++ /dev/null @@ -1,179 +0,0 @@ -#ifndef INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H -#define INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - __m128 cplxValue1, cplxValue2, result; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) / scalar; - float val1Imag = (float)(*complexVectorPtr++) / scalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - - __m128 cplxValue1, cplxValue2, result, re, im; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); - im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(re, invScalar); - cplxValue2 = _mm_mul_ps(im, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) * iScalar; - float val1Imag = (float)(*complexVectorPtr++) * iScalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} - - -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - float real = ( (float) (*complexVectorPtr++)) * invScalar; - float imag = ( (float) (*complexVectorPtr++)) * invScalar; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC_DISABLED -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_16sc_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16sc_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16sc_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h deleted file mode 100644 index a0f97f94e..000000000 --- a/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h +++ /dev/null @@ -1,151 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H -#define INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H - -#include -#include -#include - -#ifndef MAX -#define MAX(X,Y) ((X) > (Y)?(X):(Y)) -#endif - -#if LV_HAVE_SSE3 -#include -#include - -static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - - float result = 0.0; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - //float fith = 0.0; - - - - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; - - xmm9 = _mm_setzero_ps(); - xmm1 = _mm_setzero_ps(); - - xmm0 = _mm_load1_ps(¢er_point_array[0]); - xmm6 = _mm_load1_ps(¢er_point_array[1]); - xmm7 = _mm_load1_ps(¢er_point_array[2]); - xmm8 = _mm_load1_ps(¢er_point_array[3]); - //xmm11 = _mm_load1_ps(¢er_point_array[4]); - xmm10 = _mm_load1_ps(cutoff); - - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 2) & 3; - int i = 0; - - for(; i < bound; ++i) { - xmm2 = _mm_load_ps(src0); - xmm2 = _mm_max_ps(xmm10, xmm2); - xmm3 = _mm_mul_ps(xmm2, xmm2); - xmm4 = _mm_mul_ps(xmm2, xmm3); - xmm5 = _mm_mul_ps(xmm3, xmm3); - //xmm12 = _mm_mul_ps(xmm3, xmm4); - - xmm2 = _mm_mul_ps(xmm2, xmm0); - xmm3 = _mm_mul_ps(xmm3, xmm6); - xmm4 = _mm_mul_ps(xmm4, xmm7); - xmm5 = _mm_mul_ps(xmm5, xmm8); - //xmm12 = _mm_mul_ps(xmm12, xmm11); - - xmm2 = _mm_add_ps(xmm2, xmm3); - xmm3 = _mm_add_ps(xmm4, xmm5); - - src0 += 4; - - xmm9 = _mm_add_ps(xmm2, xmm9); - - xmm1 = _mm_add_ps(xmm3, xmm1); - - //xmm9 = _mm_add_ps(xmm12, xmm9); - } - - xmm2 = _mm_hadd_ps(xmm9, xmm1); - xmm3 = _mm_hadd_ps(xmm2, xmm2); - xmm4 = _mm_hadd_ps(xmm3, xmm3); - - _mm_store_ss(&result, xmm4); - - - - for(i = 0; i < leftovers; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth);// + - //center_point_array[4] * fith); - } - - result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; - - target[0] = result; -} - - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC - -static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - - - float result = 0.0; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - //float fith = 0.0; - - - - int i = 0; - - for(; i < num_bytes >> 2; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); //+ - //center_point_array[4] * fith); - /*printf("%f12...%d\n", (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth) + - //center_point_array[4] * fith) + - (center_point_array[4]), i); - */ - } - - result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); - - - - *target = result; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_32f_add_32f_a16.h b/volk/include/volk/volk_32f_32f_add_32f_a16.h deleted file mode 100644 index ba38c310f..000000000 --- a/volk/include/volk/volk_32f_32f_add_32f_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_add_32f_a16_H -#define INCLUDED_volk_32f_32f_add_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_32f_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_32f_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -extern void volk_32f_32f_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_32f_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_32f_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_32f_add_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_divide_32f_a16.h b/volk/include/volk/volk_32f_32f_divide_32f_a16.h deleted file mode 100644 index a0995e631..000000000 --- a/volk/include/volk/volk_32f_32f_divide_32f_a16.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_divide_32f_a16_H -#define INCLUDED_volk_32f_32f_divide_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -static inline void volk_32f_32f_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_div_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -static inline void volk_32f_32f_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -extern void volk_32f_32f_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_32f_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_32f_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_volk_32f_32f_divide_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h deleted file mode 100644 index 63f5221d3..000000000 --- a/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h +++ /dev/null @@ -1,184 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_a16_H -#define INCLUDED_volk_32f_32f_dot_prod_32f_a16_H - -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32f_32f_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE - - -static inline void volk_32f_32f_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_SSE3 - -#include - -static inline void volk_32f_32f_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_SSE4_1 - -#include - -static inline void volk_32f_32f_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - - bVal1 = _mm_load_ps(bPtr); bPtr += 4; - bVal2 = _mm_load_ps(bPtr); bPtr += 4; - bVal3 = _mm_load_ps(bPtr); bPtr += 4; - bVal4 = _mm_load_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - float dotProductVector[4] __attribute__((aligned(16))); - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h deleted file mode 100644 index b5fa7d7a4..000000000 --- a/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h +++ /dev/null @@ -1,184 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H -#define INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H - -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32f_32f_dot_prod_32f_ua16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE - - -static inline void volk_32f_32f_dot_prod_32f_ua16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_SSE3 - -#include - -static inline void volk_32f_32f_dot_prod_32f_ua16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_SSE4_1 - -#include - -static inline void volk_32f_32f_dot_prod_32f_ua16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; - - bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - float dotProductVector[4] __attribute__((aligned(16))); - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H*/ diff --git a/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h b/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h deleted file mode 100644 index 34ea93349..000000000 --- a/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_interleave_32fc_a16_H -#define INCLUDED_volk_32f_32f_interleave_32fc_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Interleaves the I & Q vector data into the complex vector - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param num_points The number of complex data values to be interleaved -*/ -static inline void volk_32f_32f_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ - unsigned int number = 0; - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - const uint64_t quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Interleaves the I & Q vector data into the complex vector. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param num_points The number of complex data values to be interleaved -*/ -static inline void volk_32f_32f_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_32f_interleave_32fc_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_max_32f_a16.h b/volk/include/volk/volk_32f_32f_max_32f_a16.h deleted file mode 100644 index 8ca7a5ba8..000000000 --- a/volk/include/volk/volk_32f_32f_max_32f_a16.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_max_32f_a16_H -#define INCLUDED_volk_32f_32f_max_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_32f_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_max_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_32f_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -extern void volk_32f_32f_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_32f_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_32f_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_32f_max_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_min_32f_a16.h b/volk/include/volk/volk_32f_32f_min_32f_a16.h deleted file mode 100644 index dd05988be..000000000 --- a/volk/include/volk/volk_32f_32f_min_32f_a16.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_min_32f_a16_H -#define INCLUDED_volk_32f_32f_min_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_32f_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_min_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_32f_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -extern void volk_32f_32f_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_32f_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_32f_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_32f_min_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_multiply_32f_a16.h b/volk/include/volk/volk_32f_32f_multiply_32f_a16.h deleted file mode 100644 index 2d004db10..000000000 --- a/volk/include/volk/volk_32f_32f_multiply_32f_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_multiply_32f_a16_H -#define INCLUDED_volk_32f_32f_multiply_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_32f_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_32f_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -extern void volk_32f_32f_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_32f_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_32f_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_32f_multiply_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h b/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h deleted file mode 100644 index 207382a19..000000000 --- a/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h +++ /dev/null @@ -1,155 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H -#define INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - const unsigned int quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue1, cplxValue2; - __m128i intValue1, intValue2; - - int16_t* complexVectorPtr = (int16_t*)complexVector; - - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); - cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); - cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); - - intValue1 = _mm_cvtps_epi32(cplxValue1); - intValue2 = _mm_cvtps_epi32(cplxValue2); - - intValue1 = _mm_packs_epi32(intValue1, intValue2); - - _mm_store_si128((__m128i*)complexVectorPtr, intValue1); - complexVectorPtr += 8; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } - -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - const unsigned int quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - - int16_t* complexVectorPtr = (int16_t*)complexVector; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); - - _mm_store_ps(floatBuffer, cplxValue); - - *complexVectorPtr++ = (int16_t)(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)(floatBuffer[3]); - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); - - _mm_store_ps(floatBuffer, cplxValue); - - *complexVectorPtr++ = (int16_t)(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)(floatBuffer[3]); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } - -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_32f_s32f_interleave_16sc_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - int16_t* complexVectorPtr = (int16_t*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_subtract_32f_a16.h b/volk/include/volk/volk_32f_32f_subtract_32f_a16.h deleted file mode 100644 index 9fea6aa27..000000000 --- a/volk/include/volk/volk_32f_32f_subtract_32f_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_subtract_32f_a16_H -#define INCLUDED_volk_32f_32f_subtract_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -static inline void volk_32f_32f_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_sub_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -static inline void volk_32f_32f_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -extern void volk_32f_32f_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_32f_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_32f_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_32f_subtract_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h new file mode 100644 index 000000000..a825767de --- /dev/null +++ b/volk/include/volk/volk_32f_convert_64f_u.h @@ -0,0 +1,70 @@ +#ifndef INCLUDED_volk_32f_convert_64f_u_H +#define INCLUDED_volk_32f_convert_64f_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_u_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_convert_64f_u_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_ua16.h b/volk/include/volk/volk_32f_convert_64f_ua16.h deleted file mode 100644 index c8de768dc..000000000 --- a/volk/include/volk/volk_32f_convert_64f_ua16.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INCLUDED_volk_32f_convert_64f_ua16_H -#define INCLUDED_volk_32f_convert_64f_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_32f_convert_64f_ua16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; - - for(;number < quarterPoints; number++){ - inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - ret = _mm_cvtps_pd(inputVal); - - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - - inputVal = _mm_movehl_ps(inputVal, inputVal); - - ret = _mm_cvtps_pd(inputVal); - - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_32f_convert_64f_ua16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_convert_64f_ua16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h new file mode 100644 index 000000000..d6b16e336 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h @@ -0,0 +1,110 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H +#define INCLUDED_volk_32f_s32f_convert_16i_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_16i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h new file mode 100644 index 000000000..4d306e53c --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -0,0 +1,113 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H +#define INCLUDED_volk_32f_s32f_convert_16i_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_a16.h b/volk/include/volk/volk_32f_s32f_convert_16s_a16.h deleted file mode 100644 index cf51cf9c5..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_16s_a16.h +++ /dev/null @@ -1,110 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_16s_a16_H -#define INCLUDED_volk_32f_s32f_convert_16s_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16s_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16s_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16s_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_16s_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h deleted file mode 100644 index 53d159f82..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_16s_ua16_H -#define INCLUDED_volk_32f_s32f_convert_16s_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_16s_ua16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_16s_ua16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_16s_ua16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_16s_ua16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h new file mode 100644 index 000000000..ae874fd7b --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h @@ -0,0 +1,106 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H +#define INCLUDED_volk_32f_s32f_convert_32i_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_32i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h new file mode 100644 index 000000000..561fcd800 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H +#define INCLUDED_volk_32f_s32f_convert_32i_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_a16.h b/volk/include/volk/volk_32f_s32f_convert_32s_a16.h deleted file mode 100644 index 0be649418..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_32s_a16.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_32s_a16_H -#define INCLUDED_volk_32f_s32f_convert_32s_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32s_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32s_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32s_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_32s_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h deleted file mode 100644 index efb2c3a20..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_32s_ua16_H -#define INCLUDED_volk_32f_s32f_convert_32s_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_32s_ua16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_32s_ua16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_32s_ua16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_32s_ua16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h new file mode 100644 index 000000000..c91448951 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h @@ -0,0 +1,117 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H +#define INCLUDED_volk_32f_s32f_convert_8i_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_8i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h new file mode 100644 index 000000000..420693571 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H +#define INCLUDED_volk_32f_s32f_convert_8i_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_a16.h b/volk/include/volk/volk_32f_s32f_convert_8s_a16.h deleted file mode 100644 index 69ccec5c6..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_8s_a16.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_8s_a16_H -#define INCLUDED_volk_32f_s32f_convert_8s_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8s_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8s_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8s_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_8s_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h deleted file mode 100644 index af1652b19..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_8s_ua16_H -#define INCLUDED_volk_32f_s32f_convert_8s_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_8s_ua16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_8s_ua16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_8s_ua16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_8s_ua16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h deleted file mode 100644 index 2ba809845..000000000 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h +++ /dev/null @@ -1,169 +0,0 @@ -#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H -#define INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = inputBuffer; - float meanBuffer[4] __attribute__((aligned(128))); - float squareBuffer[4] __attribute__((aligned(128))); - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal1, aVal2, aVal3, aVal4; - __m128 cVal1, cVal2, cVal3, cVal4; - for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); - accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x - - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); - accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x - - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); - accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x - - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); - accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - float meanBuffer[4] __attribute__((aligned(128))); - float squareBuffer[4] __attribute__((aligned(128))); - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); // aVal = x - accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x - aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 - squareAccumulator = _mm_add_ps(squareAccumulator, aVal); - aPtr += 4; - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_32f_32f_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h new file mode 100644 index 000000000..278089841 --- /dev/null +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h @@ -0,0 +1,169 @@ +#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H +#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + float meanBuffer[4] __attribute__((aligned(128))); + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + float meanBuffer[4] __attribute__((aligned(128))); + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_x2_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_add_32f_a16.h b/volk/include/volk/volk_32f_x2_add_32f_a16.h new file mode 100644 index 000000000..d0d0e0a0e --- /dev/null +++ b/volk/include/volk/volk_32f_x2_add_32f_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_x2_add_32f_a16_H +#define INCLUDED_volk_32f_x2_add_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +extern void volk_32f_x2_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_add_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a16.h b/volk/include/volk/volk_32f_x2_divide_32f_a16.h new file mode 100644 index 000000000..d844e25b0 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_divide_32f_a16.h @@ -0,0 +1,82 @@ +#ifndef INCLUDED_volk_32f_x2_divide_32f_a16_H +#define INCLUDED_volk_32f_x2_divide_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_div_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +extern void volk_32f_x2_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_volk_32f_x2_divide_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h new file mode 100644 index 000000000..61aa56815 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h @@ -0,0 +1,184 @@ +#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H +#define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + + bVal1 = _mm_load_ps(bPtr); bPtr += 4; + bVal2 = _mm_load_ps(bPtr); bPtr += 4; + bVal3 = _mm_load_ps(bPtr); bPtr += 4; + bVal4 = _mm_load_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + float dotProductVector[4] __attribute__((aligned(16))); + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h new file mode 100644 index 000000000..8469a3cea --- /dev/null +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -0,0 +1,184 @@ +#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H +#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; + + bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + float dotProductVector[4] __attribute__((aligned(16))); + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/ diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h new file mode 100644 index 000000000..29c9392df --- /dev/null +++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a16_H +#define INCLUDED_volk_32f_x2_interleave_32fc_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Interleaves the I & Q vector data into the complex vector + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + unsigned int number = 0; + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + const uint64_t quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Interleaves the I & Q vector data into the complex vector. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_x2_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_max_32f_a16.h b/volk/include/volk/volk_32f_x2_max_32f_a16.h new file mode 100644 index 000000000..26e7f1246 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_max_32f_a16.h @@ -0,0 +1,85 @@ +#ifndef INCLUDED_volk_32f_x2_max_32f_a16_H +#define INCLUDED_volk_32f_x2_max_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_max_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_x2_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_max_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_min_32f_a16.h b/volk/include/volk/volk_32f_x2_min_32f_a16.h new file mode 100644 index 000000000..23bae044c --- /dev/null +++ b/volk/include/volk/volk_32f_x2_min_32f_a16.h @@ -0,0 +1,85 @@ +#ifndef INCLUDED_volk_32f_x2_min_32f_a16_H +#define INCLUDED_volk_32f_x2_min_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_min_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_x2_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_min_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h new file mode 100644 index 000000000..a0dcfa86e --- /dev/null +++ b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_x2_multiply_32f_a16_H +#define INCLUDED_volk_32f_x2_multiply_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +extern void volk_32f_x2_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_multiply_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h new file mode 100644 index 000000000..30306774d --- /dev/null +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h @@ -0,0 +1,155 @@ +#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H +#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue1, cplxValue2; + __m128i intValue1, intValue2; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); + + intValue1 = _mm_cvtps_epi32(cplxValue1); + intValue2 = _mm_cvtps_epi32(cplxValue2); + + intValue1 = _mm_packs_epi32(intValue1, intValue2); + + _mm_store_si128((__m128i*)complexVectorPtr, intValue1); + complexVectorPtr += 8; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_x2_s32f_interleave_16ic_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + int16_t* complexVectorPtr = (int16_t*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a16.h b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h new file mode 100644 index 000000000..7404bfe79 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_x2_subtract_32f_a16_H +#define INCLUDED_volk_32f_x2_subtract_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_sub_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +extern void volk_32f_x2_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_subtract_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h new file mode 100644 index 000000000..af9e39537 --- /dev/null +++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h @@ -0,0 +1,151 @@ +#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H +#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H + +#include +#include +#include + +#ifndef MAX +#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +#endif + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; + + xmm9 = _mm_setzero_ps(); + xmm1 = _mm_setzero_ps(); + + xmm0 = _mm_load1_ps(¢er_point_array[0]); + xmm6 = _mm_load1_ps(¢er_point_array[1]); + xmm7 = _mm_load1_ps(¢er_point_array[2]); + xmm8 = _mm_load1_ps(¢er_point_array[3]); + //xmm11 = _mm_load1_ps(¢er_point_array[4]); + xmm10 = _mm_load1_ps(cutoff); + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 2) & 3; + int i = 0; + + for(; i < bound; ++i) { + xmm2 = _mm_load_ps(src0); + xmm2 = _mm_max_ps(xmm10, xmm2); + xmm3 = _mm_mul_ps(xmm2, xmm2); + xmm4 = _mm_mul_ps(xmm2, xmm3); + xmm5 = _mm_mul_ps(xmm3, xmm3); + //xmm12 = _mm_mul_ps(xmm3, xmm4); + + xmm2 = _mm_mul_ps(xmm2, xmm0); + xmm3 = _mm_mul_ps(xmm3, xmm6); + xmm4 = _mm_mul_ps(xmm4, xmm7); + xmm5 = _mm_mul_ps(xmm5, xmm8); + //xmm12 = _mm_mul_ps(xmm12, xmm11); + + xmm2 = _mm_add_ps(xmm2, xmm3); + xmm3 = _mm_add_ps(xmm4, xmm5); + + src0 += 4; + + xmm9 = _mm_add_ps(xmm2, xmm9); + + xmm1 = _mm_add_ps(xmm3, xmm1); + + //xmm9 = _mm_add_ps(xmm12, xmm9); + } + + xmm2 = _mm_hadd_ps(xmm9, xmm1); + xmm3 = _mm_hadd_ps(xmm2, xmm2); + xmm4 = _mm_hadd_ps(xmm3, xmm3); + + _mm_store_ss(&result, xmm4); + + + + for(i = 0; i < leftovers; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth);// + + //center_point_array[4] * fith); + } + + result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; + + target[0] = result; +} + + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC + +static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + int i = 0; + + for(; i < num_bytes >> 2; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth); //+ + //center_point_array[4] * fith); + /*printf("%f12...%d\n", (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth) + + //center_point_array[4] * fith) + + (center_point_array[4]), i); + */ + } + + result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); + + + + *target = result; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h deleted file mode 100644 index cd9cc8160..000000000 --- a/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h +++ /dev/null @@ -1,344 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H -#define INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H - -#include -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; - sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] + in[3] * tp[3]; - sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - - } - /* - for(i = 0; i < num_bytes >> 3; ++i) { - *result += input[i] * conjf(taps[i]); - } - */ -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - - - - asm volatile - ( - "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %[conjugator], %%r9\n\t" - " movq %%rcx, %%rax\n\t" - " movaps 0(%%r9), %%xmm8\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movups 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " xorps %%xmm8, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " xorps %%xmm8, %%xmm3\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - " xorps %%xmm8, %%xmm2\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) - :"rax", "r8", "r9", "r10" - ); - - - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); - - } - - return; -} -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - int bound = num_bytes >> 4; - int leftovers = num_bytes % 16; - - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " #movl 12(%%ebp), %%eax # input\n\t" - " #movl 16(%%ebp), %%edx # taps\n\t" - " #movl 20(%%ebp), %%ecx # n_bytes\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%[eax]), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%[edx]), %%xmm2\n\t" - " movl %[ecx], (%[out])\n\t" - " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" - - " xorps %%xmm1, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%[eax]), %%xmmA\n\t" - "# movaps (%[edx]), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%[edx]), %%xmm3\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " xorps %%xmm1, %%xmm3\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " movaps 16(%[eax]), %%xmm1\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%[eax]), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %[eax]\n\t" - " movaps 32(%[edx]), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " xorps %%xmm1, %%xmm2\n\t" - " addl $32, %[edx]\n\t" - ".%=L1_test:\n\t" - " decl %[ecx]\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" - " shrl $4, %[ecx]\n\t" - " andl $1, %[ecx]\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " #movl 8(%%ebp), %[eax] \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%[out])\n\t" - " movss (%[out]), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %[eax] # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) - ); - - - - - printf("%d, %d\n", leftovers, bound); - - for(; leftovers > 0; leftovers -= 8) { - - - *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); - - } - - return; - - - - - - -} - -#endif /*LV_HAVE_SSE*/ - - - -#endif /*INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h deleted file mode 100644 index 2ccfcf2f2..000000000 --- a/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h +++ /dev/null @@ -1,468 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H -#define INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H - -#include -#include -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32fc_32fc_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; - - } - -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - - asm - ( - "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %%rcx, %%rax\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) - :"rax", "r8", "r9", "r10" - ); - - - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - - } - - return; - -} - -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 - -static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " movl 12(%%ebp), %%eax # input\n\t" - " movl 16(%%ebp), %%edx # taps\n\t" - " movl 20(%%ebp), %%ecx # n_bytes\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%eax), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%edx), %%xmm2\n\t" - " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%eax), %%xmmA\n\t" - "# movaps (%%edx), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%eax), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%edx), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%eax), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %%eax\n\t" - " movaps 32(%%edx), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " addl $32, %%edx\n\t" - ".%=L1_test:\n\t" - " decl %%ecx\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" - " shrl $4, %%ecx\n\t" - " andl $1, %%ecx\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " movl 8(%%ebp), %%eax \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%%eax)\n\t" - " movss (%%eax), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %%eax # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : - : "eax", "ecx", "edx" - ); - - - int getem = num_bytes % 16; - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - - } - - return; - - - - - - -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_SSE3 - -#include - -static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); - - unsigned int number = 0; - const unsigned int halfPoints = num_bytes >> 4; - - __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; - - dotProdVal = _mm_setzero_ps(); - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together - - a += 2; - b += 2; - } - - lv_32fc_t dotProductVector[2] __attribute__((aligned(16))); - - _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - - if((num_bytes >> 2) != 0) { - dotProduct += (*a) * (*b); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_SSE4_1 - -#include - -static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - volk_32fc_32fc_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes); - // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now - /* - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; - float *p_input, *p_taps; - __m64 *p_result; - - p_result = (__m64*)result; - p_input = (float*)input; - p_taps = (float*)taps; - - static const __m128i neg = {0x000000000000000080000000}; - - int i = 0; - - int bound = (num_bytes >> 5); - int leftovers = (num_bytes & 24) >> 3; - - real0 = _mm_sub_ps(real0, real0); - real1 = _mm_sub_ps(real1, real1); - im0 = _mm_sub_ps(im0, im0); - im1 = _mm_sub_ps(im1, im1); - - for(; i < bound; ++i) { - - - xmm0 = _mm_load_ps(p_input); - xmm1 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm2 = _mm_load_ps(p_input); - xmm3 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm4 = _mm_unpackhi_ps(xmm0, xmm2); - xmm5 = _mm_unpackhi_ps(xmm1, xmm3); - xmm0 = _mm_unpacklo_ps(xmm0, xmm2); - xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - - //imaginary vector from input - xmm1 = _mm_unpackhi_ps(xmm0, xmm4); - //real vector from input - xmm3 = _mm_unpacklo_ps(xmm0, xmm4); - //imaginary vector from taps - xmm0 = _mm_unpackhi_ps(xmm2, xmm5); - //real vector from taps - xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - - xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); - xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - - xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); - xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - - real0 = _mm_add_ps(xmm4, real0); - real1 = _mm_add_ps(xmm5, real1); - im0 = _mm_add_ps(xmm6, im0); - im1 = _mm_add_ps(xmm7, im1); - - } - - - - - real1 = _mm_xor_ps(real1, (__m128)neg); - - - im0 = _mm_add_ps(im0, im1); - real0 = _mm_add_ps(real0, real1); - - im0 = _mm_add_ps(im0, real0); - - _mm_storel_pi(p_result, im0); - - for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { - - *result += input[i] * taps[i]; - } - */ -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h deleted file mode 100644 index 59259882c..000000000 --- a/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H -#define INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H - -#include -#include -#include -#include - -#if LV_HAVE_SSE3 -#include - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_32fc_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_store_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * (*b); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_32fc_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_32fc_32fc_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); -static inline void volk_32fc_32fc_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - static const float mask = -0.0; - volk_32fc_32fc_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points); -} -#endif /* LV_HAVE_ORC */ - - - - - -#endif /* INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h deleted file mode 100644 index 14f511697..000000000 --- a/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H -#define INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H - -#include -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -#include - -static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - - - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - - lv_32fc_t diff; - memset(&diff, 0x0, 2*sizeof(float)); - - float sq_dist = 0.0; - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - - - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm8 = _mm_load1_ps(&scalar); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); - - - for(; i < bound - 1; ++i) { - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm3 = _mm_load_ps((float*)&points[2]); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_store_ps(target, xmm4); - - target += 4; - - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_store_ps(target, xmm4); - - target += 4; - - - for(i = 0; i < leftovers0; ++i) { - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); - - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); - - xmm4 = _mm_hadd_ps(xmm6, xmm6); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_storeh_pi((__m64*)target, xmm4); - - target += 2; - } - - for(i = 0; i < leftovers1; ++i) { - - diff = src0[0] - points[0]; - - sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - - target[0] = sq_dist; - } -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC -static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - lv_32fc_t diff; - float sq_dist; - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; - - sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - - target[i] = sq_dist; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h deleted file mode 100644 index b6c72adbf..000000000 --- a/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H -#define INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -#include - -static inline void volk_32fc_32fc_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - - - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - lv_32fc_t diff; - float sq_dist; - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); - - - for(; i < bound - 1; ++i) { - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm3 = _mm_load_ps((float*)&points[2]); - - _mm_store_ps(target, xmm4); - - target += 4; - - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - _mm_store_ps(target, xmm4); - - target += 4; - - for(i = 0; i < leftovers0; ++i) { - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); - - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); - - xmm4 = _mm_hadd_ps(xmm6, xmm6); - - _mm_storeh_pi((__m64*)target, xmm4); - - target += 2; - } - - for(i = 0; i < leftovers1; ++i) { - - diff = src0[0] - points[0]; - - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[0] = sq_dist; - } -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC -static inline void volk_32fc_32fc_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - lv_32fc_t diff; - float sq_dist; - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; - - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[i] = sq_dist; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h deleted file mode 100644 index 3ee579c2e..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H -#define INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h new file mode 100644 index 000000000..84d2576ed --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H +#define INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h deleted file mode 100644 index 404defc36..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H -#define INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_64f_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - - for(;number < halfPoints; number++){ - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(qBufferPtr, dVal); - - iBufferPtr += 2; - qBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_64f_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - *qBufferPtr++ = (double)*complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h new file mode 100644 index 000000000..34262a7af --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h @@ -0,0 +1,78 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H +#define INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h new file mode 100644 index 000000000..6042e6d62 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h @@ -0,0 +1,80 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_real_16i_a16_H +#define INCLUDED_volk_32fc_deinterleave_real_16i_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + iValue = _mm_mul_ps(iValue, vScalar); + + _mm_store_ps(floatBuffer, iValue); + *iBufferPtr++ = (int16_t)(floatBuffer[0]); + *iBufferPtr++ = (int16_t)(floatBuffer[1]); + *iBufferPtr++ = (int16_t)(floatBuffer[2]); + *iBufferPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + iBufferPtr = &iBuffer[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h deleted file mode 100644 index 53235e5f7..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_real_16s_a16_H -#define INCLUDED_volk_32fc_deinterleave_real_16s_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_16s_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - - iValue = _mm_mul_ps(iValue, vScalar); - - _mm_store_ps(floatBuffer, iValue); - *iBufferPtr++ = (int16_t)(floatBuffer[0]); - *iBufferPtr++ = (int16_t)(floatBuffer[1]); - *iBufferPtr++ = (int16_t)(floatBuffer[2]); - *iBufferPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - iBufferPtr = &iBuffer[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_real_16s_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h new file mode 100644 index 000000000..530359600 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H +#define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, result; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_32fc_s32f_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_32fc_s32f_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + volk_32fc_s32f_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h deleted file mode 100644 index dc3c6741a..000000000 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H -#define INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_s32f_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, result; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - result = _mm_mul_ps(result, vScalar); - - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_s32f_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - result = _mm_mul_ps(result, vScalar); - - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_s32f_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_32fc_s32f_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_32fc_s32f_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - volk_32fc_s32f_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h deleted file mode 100644 index 29ccdaef7..000000000 --- a/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H -#define INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Calculates the log10 power value divided by the RBW for each input point - \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param rbw The resolution bandwith of the fft spectrum - \param num_points The number of fft data points -*/ -static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - -#if LV_HAVE_LIB_SIMDMATH - __m128 magScalar = _mm_set_ps1(10.0); - magScalar = _mm_div_ps(magScalar, logf4(magScalar)); - - __m128 invRBW = _mm_set_ps1(iRBW); - - __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); - - __m128 power; - __m128 input1, input2; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the complex values - input1 =_mm_load_ps(inputPtr); - inputPtr += 4; - input2 =_mm_load_ps(inputPtr); - inputPtr += 4; - - // Apply the normalization factor - input1 = _mm_mul_ps(input1, invNormalizationFactor); - input2 = _mm_mul_ps(input2, invNormalizationFactor); - - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm_mul_ps(input1, input1); - // (r3*r3), (i3*i3), (r4*r4), (i4*i4) - input2 = _mm_mul_ps(input2, input2); - - // Horizontal add, to add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - power = _mm_hadd_ps(input1, input2); - - // Divide by the rbw - power = _mm_mul_ps(power, invRBW); - - // Calculate the natural log power - power = logf4(power); - - // Convert to log10 and multiply by 10.0 - power = _mm_mul_ps(power, magScalar); - - // Store the floating point results - _mm_store_ps(destPtr, power); - - destPtr += 4; - } - - number = quarterPoints*4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); - destPtr++; - } - -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the log10 power value divided by the RBW for each input point - \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param rbw The resolution bandwith of the fft spectrum - \param num_points The number of fft data points -*/ -static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ - // Calculate the Power of the complex point - const float* inputPtr = (float*)complexFFTInput; - float* realFFTDataPointsPtr = logPowerOutput; - unsigned int point; - const float invRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - - for(point = 0; point < num_points; point++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); - - realFFTDataPointsPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h new file mode 100644 index 000000000..0120b5307 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h @@ -0,0 +1,134 @@ +#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H +#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invRBW = _mm_set_ps1(iRBW); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Divide by the rbw + power = _mm_mul_ps(power, invRBW); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + unsigned int point; + const float invRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h new file mode 100644 index 000000000..a01971df3 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h @@ -0,0 +1,344 @@ +#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H +#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H + +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + + + + asm volatile + ( + "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %[conjugator], %%r9\n\t" + " movq %%rcx, %%rax\n\t" + " movaps 0(%%r9), %%xmm8\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movups 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " xorps %%xmm8, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " xorps %%xmm8, %%xmm3\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + " xorps %%xmm8, %%xmm2\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); + + } + + return; +} +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + int bound = num_bytes >> 4; + int leftovers = num_bytes % 16; + + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " #movl 12(%%ebp), %%eax # input\n\t" + " #movl 16(%%ebp), %%edx # taps\n\t" + " #movl 20(%%ebp), %%ecx # n_bytes\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%[eax]), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%[edx]), %%xmm2\n\t" + " movl %[ecx], (%[out])\n\t" + " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" + + " xorps %%xmm1, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%[eax]), %%xmmA\n\t" + "# movaps (%[edx]), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%[edx]), %%xmm3\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " xorps %%xmm1, %%xmm3\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " movaps 16(%[eax]), %%xmm1\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%[eax]), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %[eax]\n\t" + " movaps 32(%[edx]), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " xorps %%xmm1, %%xmm2\n\t" + " addl $32, %[edx]\n\t" + ".%=L1_test:\n\t" + " decl %[ecx]\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" + " shrl $4, %[ecx]\n\t" + " andl $1, %[ecx]\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " #movl 8(%%ebp), %[eax] \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%[out])\n\t" + " movss (%[out]), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %[eax] # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) + ); + + + + + printf("%d, %d\n", leftovers, bound); + + for(; leftovers > 0; leftovers -= 8) { + + + *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + + + +#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h new file mode 100644 index 000000000..9a7b65ab4 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h @@ -0,0 +1,468 @@ +#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H +#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H + +#include +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; + + } + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + asm + ( + "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %%rcx, %%rax\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " movl 12(%%ebp), %%eax # input\n\t" + " movl 16(%%ebp), %%edx # taps\n\t" + " movl 20(%%ebp), %%ecx # n_bytes\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%eax), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%edx), %%xmm2\n\t" + " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%eax), %%xmmA\n\t" + "# movaps (%%edx), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%eax), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%edx), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%eax), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %%eax\n\t" + " movaps 32(%%edx), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " addl $32, %%edx\n\t" + ".%=L1_test:\n\t" + " decl %%ecx\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" + " shrl $4, %%ecx\n\t" + " andl $1, %%ecx\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " movl 8(%%ebp), %%eax \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%%eax)\n\t" + " movss (%%eax), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %%eax # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : + : "eax", "ecx", "edx" + ); + + + int getem = num_bytes % 16; + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_bytes >> 4; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + lv_32fc_t dotProductVector[2] __attribute__((aligned(16))); + + _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if((num_bytes >> 2) != 0) { + dotProduct += (*a) * (*b); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + volk_32fc_x2_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes); + // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now + /* + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; + float *p_input, *p_taps; + __m64 *p_result; + + p_result = (__m64*)result; + p_input = (float*)input; + p_taps = (float*)taps; + + static const __m128i neg = {0x000000000000000080000000}; + + int i = 0; + + int bound = (num_bytes >> 5); + int leftovers = (num_bytes & 24) >> 3; + + real0 = _mm_sub_ps(real0, real0); + real1 = _mm_sub_ps(real1, real1); + im0 = _mm_sub_ps(im0, im0); + im1 = _mm_sub_ps(im1, im1); + + for(; i < bound; ++i) { + + + xmm0 = _mm_load_ps(p_input); + xmm1 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm2 = _mm_load_ps(p_input); + xmm3 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); + xmm5 = _mm_unpackhi_ps(xmm1, xmm3); + xmm0 = _mm_unpacklo_ps(xmm0, xmm2); + xmm2 = _mm_unpacklo_ps(xmm1, xmm3); + + //imaginary vector from input + xmm1 = _mm_unpackhi_ps(xmm0, xmm4); + //real vector from input + xmm3 = _mm_unpacklo_ps(xmm0, xmm4); + //imaginary vector from taps + xmm0 = _mm_unpackhi_ps(xmm2, xmm5); + //real vector from taps + xmm2 = _mm_unpacklo_ps(xmm2, xmm5); + + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); + xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); + + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); + xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); + + real0 = _mm_add_ps(xmm4, real0); + real1 = _mm_add_ps(xmm5, real1); + im0 = _mm_add_ps(xmm6, im0); + im1 = _mm_add_ps(xmm7, im1); + + } + + + + + real1 = _mm_xor_ps(real1, (__m128)neg); + + + im0 = _mm_add_ps(im0, im1); + real0 = _mm_add_ps(real0, real1); + + im0 = _mm_add_ps(im0, real0); + + _mm_storel_pi(p_result, im0); + + for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { + + *result += input[i] * taps[i]; + } + */ +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h new file mode 100644 index 000000000..224ab19c8 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h @@ -0,0 +1,95 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a16_H +#define INCLUDED_volk_32fc_x2_multiply_32fc_a16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); +static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + static const float mask = -0.0; + volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points); +} +#endif /* LV_HAVE_ORC */ + + + + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h new file mode 100644 index 000000000..6a863b16d --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H +#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + lv_32fc_t diff; + memset(&diff, 0x0, 2*sizeof(float)); + + float sq_dist = 0.0; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm8 = _mm_load1_ps(&scalar); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h new file mode 100644 index 000000000..406097fc8 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h @@ -0,0 +1,112 @@ +#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a16_H +#define INCLUDED_volk_32fc_x2_square_dist_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + lv_32fc_t diff; + float sq_dist; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + _mm_store_ps(target, xmm4); + + target += 4; + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a16_H*/ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a16.h b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h new file mode 100644 index 000000000..0fcadd9cb --- /dev/null +++ b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_volk_32i_s32f_convert_32f_a16_H +#define INCLUDED_volk_32i_s32f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_load_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_store_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32i_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32i_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h new file mode 100644 index 000000000..1dd6422f8 --- /dev/null +++ b/volk/include/volk/volk_32i_s32f_convert_32f_u.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H +#define INCLUDED_volk_32i_s32f_convert_32f_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32i_s32f_convert_32f_u_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */ diff --git a/volk/include/volk/volk_32i_x2_and_32i_a16.h b/volk/include/volk/volk_32i_x2_and_32i_a16.h new file mode 100644 index 000000000..3baa1d856 --- /dev/null +++ b/volk/include/volk/volk_32i_x2_and_32i_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32i_x2_and_32i_a16_H +#define INCLUDED_volk_32i_x2_and_32i_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_and_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] & bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) & (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +extern void volk_32i_x2_and_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32i_x2_and_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32i_x2_and_32i_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32i_x2_and_32i_a16_H */ diff --git a/volk/include/volk/volk_32i_x2_or_32i_a16.h b/volk/include/volk/volk_32i_x2_or_32i_a16.h new file mode 100644 index 000000000..0be22f00a --- /dev/null +++ b/volk/include/volk/volk_32i_x2_or_32i_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32i_x2_or_32i_a16_H +#define INCLUDED_volk_32i_x2_or_32i_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_or_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] | bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) | (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +extern void volk_32i_x2_or_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32i_x2_or_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32i_x2_or_32i_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32i_x2_or_32i_a16_H */ diff --git a/volk/include/volk/volk_32s_32s_and_32s_a16.h b/volk/include/volk/volk_32s_32s_and_32s_a16.h deleted file mode 100644 index 0e8380757..000000000 --- a/volk/include/volk/volk_32s_32s_and_32s_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32s_32s_and_32s_a16_H -#define INCLUDED_volk_32s_32s_and_32s_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -static inline void volk_32s_32s_and_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_and_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] & bVector[number]; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -static inline void volk_32s_32s_and_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) & (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -extern void volk_32s_32s_and_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32s_32s_and_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32s_32s_and_32s_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32s_32s_and_32s_a16_H */ diff --git a/volk/include/volk/volk_32s_32s_or_32s_a16.h b/volk/include/volk/volk_32s_32s_or_32s_a16.h deleted file mode 100644 index 2dcf2e551..000000000 --- a/volk/include/volk/volk_32s_32s_or_32s_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32s_32s_or_32s_a16_H -#define INCLUDED_volk_32s_32s_or_32s_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -static inline void volk_32s_32s_or_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_or_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] | bVector[number]; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -static inline void volk_32s_32s_or_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) | (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -extern void volk_32s_32s_or_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32s_32s_or_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32s_32s_or_32s_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32s_32s_or_32s_a16_H */ diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_a16.h b/volk/include/volk/volk_32s_s32f_convert_32f_a16.h deleted file mode 100644 index c16ecc9dd..000000000 --- a/volk/include/volk/volk_32s_s32f_convert_32f_a16.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_volk_32s_s32f_convert_32f_a16_H -#define INCLUDED_volk_32s_s32f_convert_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32s_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_load_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_store_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32s_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32s_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h deleted file mode 100644 index 4eb5a5b85..000000000 --- a/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_volk_32s_s32f_convert_32f_ua16_H -#define INCLUDED_volk_32s_s32f_convert_32f_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_32s_s32f_convert_32f_ua16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_32s_s32f_convert_32f_ua16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32s_s32f_convert_32f_ua16_H */ diff --git a/volk/include/volk/volk_64f_64f_max_64f_a16.h b/volk/include/volk/volk_64f_64f_max_64f_a16.h deleted file mode 100644 index 7e091851f..000000000 --- a/volk/include/volk/volk_64f_64f_max_64f_a16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_volk_64f_64f_max_64f_a16_H -#define INCLUDED_volk_64f_64f_max_64f_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_64f_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); - - cVal = _mm_max_pd(aVal, bVal); - - _mm_store_pd(cPtr,cVal); // Store the results back into the C container - - aPtr += 2; - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_64f_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_64f_64f_max_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_64f_min_64f_a16.h b/volk/include/volk/volk_64f_64f_min_64f_a16.h deleted file mode 100644 index f2bcbe83b..000000000 --- a/volk/include/volk/volk_64f_64f_min_64f_a16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_volk_64f_64f_min_64f_a16_H -#define INCLUDED_volk_64f_64f_min_64f_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_64f_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); - - cVal = _mm_min_pd(aVal, bVal); - - _mm_store_pd(cPtr,cVal); // Store the results back into the C container - - aPtr += 2; - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_64f_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_64f_64f_min_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h new file mode 100644 index 000000000..6338c1433 --- /dev/null +++ b/volk/include/volk/volk_64f_convert_32f_u.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_volk_64f_convert_32f_u_H +#define INCLUDED_volk_64f_convert_32f_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_u_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64f_convert_32f_u_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_ua16.h b/volk/include/volk/volk_64f_convert_32f_ua16.h deleted file mode 100644 index 7774db1b7..000000000 --- a/volk/include/volk/volk_64f_convert_32f_ua16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_volk_64f_convert_32f_ua16_H -#define INCLUDED_volk_64f_convert_32f_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_64f_convert_32f_ua16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); - - ret = _mm_movelh_ps(ret, ret2); - - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_64f_convert_32f_ua16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_64f_convert_32f_ua16_H */ diff --git a/volk/include/volk/volk_64f_x2_max_64f_a16.h b/volk/include/volk/volk_64f_x2_max_64f_a16.h new file mode 100644 index 000000000..4b0c1f5f1 --- /dev/null +++ b/volk/include/volk/volk_64f_x2_max_64f_a16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_64f_x2_max_64f_a16_H +#define INCLUDED_volk_64f_x2_max_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_max_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_64f_x2_max_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_x2_min_64f_a16.h b/volk/include/volk/volk_64f_x2_min_64f_a16.h new file mode 100644 index 000000000..aa961e384 --- /dev/null +++ b/volk/include/volk/volk_64f_x2_min_64f_a16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_64f_x2_min_64f_a16_H +#define INCLUDED_volk_64f_x2_min_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_min_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_64f_x2_min_64f_a16_H */ diff --git a/volk/include/volk/volk_8i_convert_16i_a16.h b/volk/include/volk/volk_8i_convert_16i_a16.h new file mode 100644 index 000000000..3d7045753 --- /dev/null +++ b/volk/include/volk/volk_8i_convert_16i_a16.h @@ -0,0 +1,83 @@ +#ifndef INCLUDED_volk_8i_convert_16i_a16_H +#define INCLUDED_volk_8i_convert_16i_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +extern void volk_8i_convert_16i_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); +static inline void volk_8i_convert_16i_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + volk_8i_convert_16i_a16_orc_impl(outputVector, inputVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_convert_16i_u.h b/volk/include/volk/volk_8i_convert_16i_u.h new file mode 100644 index 000000000..bcff13406 --- /dev/null +++ b/volk/include/volk/volk_8i_convert_16i_u.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_volk_8i_convert_16i_u_H +#define INCLUDED_volk_8i_convert_16i_u_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8i_convert_16i_u_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h new file mode 100644 index 000000000..d5c8eeb51 --- /dev/null +++ b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h @@ -0,0 +1,105 @@ +#ifndef INCLUDED_volk_8i_s32f_convert_32f_a16_H +#define INCLUDED_volk_8i_s32f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); +static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h new file mode 100644 index 000000000..1e30957e8 --- /dev/null +++ b/volk/include/volk/volk_8i_s32f_convert_32f_u.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H +#define INCLUDED_volk_8i_s32f_convert_32f_u_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1( iScalar ); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8i_s32f_convert_32f_u_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h new file mode 100644 index 000000000..91c9b2c58 --- /dev/null +++ b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H +#define INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + __m128i complexVal, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iOutputVal = _mm_cvtepi8_epi16(iOutputVal); + iOutputVal = _mm_slli_epi16(iOutputVal, 8); + + qOutputVal = _mm_cvtepi8_epi16(qOutputVal); + qOutputVal = _mm_slli_epi16(qOutputVal, 8); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h new file mode 100644 index 000000000..bf3dc20dd --- /dev/null +++ b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a16_H +#define INCLUDED_volk_8ic_deinterleave_real_16i_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i complexVal, outputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + outputVal = _mm_cvtepi8_epi16(complexVal); + outputVal = _mm_slli_epi16(outputVal, 7); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h new file mode 100644 index 000000000..13de79423 --- /dev/null +++ b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, outputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); + + outputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h new file mode 100644 index 000000000..22c3ebb23 --- /dev/null +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h @@ -0,0 +1,164 @@ +#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H +#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue, qFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + iComplexVal = _mm_srli_si128(iComplexVal, 4); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 4; + + qComplexVal = _mm_srli_si128(qComplexVal, 4); + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + + qBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + float floatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int8_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h new file mode 100644 index 000000000..5f1430394 --- /dev/null +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h @@ -0,0 +1,133 @@ +#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H +#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + + complexVal = _mm_srli_si128(complexVal, 4); + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h new file mode 100644 index 000000000..d9cacbf46 --- /dev/null +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h @@ -0,0 +1,102 @@ +#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H +#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + lv_16sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + int16_t* c16Ptr = (int16_t*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + int16_t* c16Ptr = (int16_t*)cVector; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number =0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H */ diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h new file mode 100644 index 000000000..6ec923a4f --- /dev/null +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H +#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + __m128 ret; + lv_32fc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + a += 4; + b += 4; + } + + number = quarterPoints * 4; + float* cFloatPtr = (float*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cFloatPtr++ = lv_creal(temp) / scalar; + *cFloatPtr++ = lv_cimag(temp) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* cPtr = (float*)cVector; + const float invScalar = 1.0 / scalar; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number = 0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cPtr++ = (lv_creal(temp) * invScalar); + *cPtr++ = (lv_cimag(temp) * invScalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_a16.h b/volk/include/volk/volk_8s_convert_16s_a16.h deleted file mode 100644 index 38efdb6a3..000000000 --- a/volk/include/volk/volk_8s_convert_16s_a16.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef INCLUDED_volk_8s_convert_16s_a16_H -#define INCLUDED_volk_8s_convert_16s_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_convert_16s_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_convert_16s_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -extern void volk_8s_convert_16s_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); -static inline void volk_8s_convert_16s_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - volk_8s_convert_16s_a16_orc_impl(outputVector, inputVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_ua16.h b/volk/include/volk/volk_8s_convert_16s_ua16.h deleted file mode 100644 index a726bfb5e..000000000 --- a/volk/include/volk/volk_8s_convert_16s_ua16.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_volk_8s_convert_16s_ua16_H -#define INCLUDED_volk_8s_convert_16s_ua16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned - */ -static inline void volk_8s_convert_16s_ua16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned - */ -static inline void volk_8s_convert_16s_ua16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_a16.h b/volk/include/volk/volk_8s_s32f_convert_32f_a16.h deleted file mode 100644 index 45185ac2e..000000000 --- a/volk/include/volk/volk_8s_s32f_convert_32f_a16.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef INCLUDED_volk_8s_s32f_convert_32f_a16_H -#define INCLUDED_volk_8s_s32f_convert_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -extern void volk_8s_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); -static inline void volk_8s_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - volk_8s_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h deleted file mode 100644 index 310824580..000000000 --- a/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_volk_8s_s32f_convert_32f_ua16_H -#define INCLUDED_volk_8s_s32f_convert_32f_ua16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_8s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1( iScalar ); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_8s_s32f_convert_32f_ua16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h b/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h deleted file mode 100644 index eae1185ec..000000000 --- a/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H -#define INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - lv_16sc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - const int shuffleMask = _MM_SHUFFLE(2,3,0,1); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); - - a += 4; - b += 4; - c += 4; - } - - number = quarterPoints * 4; - int16_t* c16Ptr = (int16_t*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - int16_t* c16Ptr = (int16_t*)cVector; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number =0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H */ diff --git a/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h deleted file mode 100644 index 621276b08..000000000 --- a/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H -#define INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - __m128 ret; - lv_32fc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - const int shuffleMask = _MM_SHUFFLE(2,3,0,1); - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - a += 4; - b += 4; - } - - number = quarterPoints * 4; - float* cFloatPtr = (float*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cFloatPtr++ = lv_creal(temp) / scalar; - *cFloatPtr++ = lv_cimag(temp) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* cPtr = (float*)cVector; - const float invScalar = 1.0 / scalar; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number = 0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cPtr++ = (lv_creal(temp) * invScalar); - *cPtr++ = (lv_cimag(temp) * invScalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h deleted file mode 100644 index 6a35e969d..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H -#define INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_16s_16s_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - __m128i complexVal, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iOutputVal = _mm_cvtepi8_epi16(iOutputVal); - iOutputVal = _mm_slli_epi16(iOutputVal, 8); - - qOutputVal = _mm_cvtepi8_epi16(qOutputVal); - qOutputVal = _mm_slli_epi16(qOutputVal, 8); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h deleted file mode 100644 index 67ffebd99..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef INCLUDED_volk_8sc_deinterleave_real_16s_a16_H -#define INCLUDED_volk_8sc_deinterleave_real_16s_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_16s_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i complexVal, outputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - outputVal = _mm_cvtepi8_epi16(complexVal); - outputVal = _mm_slli_epi16(outputVal, 7); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE4_1 */ - - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8sc_deinterleave_real_16s_a16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h deleted file mode 100644 index ecffc092e..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, outputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); - - outputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h deleted file mode 100644 index cedbf202c..000000000 --- a/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h +++ /dev/null @@ -1,164 +0,0 @@ -#ifndef INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H -#define INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue, qFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - iComplexVal = _mm_srli_si128(iComplexVal, 4); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - qBufferPtr += 4; - - qComplexVal = _mm_srli_si128(qComplexVal, 4); - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - - qBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; - - float floatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); - - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int8_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h deleted file mode 100644 index 902795131..000000000 --- a/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h +++ /dev/null @@ -1,133 +0,0 @@ -#ifndef INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H -#define INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - - complexVal = _mm_srli_si128(complexVal, 4); - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; - - iValue = _mm_load_ps(floatBuffer); - - iValue = _mm_mul_ps(iValue, invScalar); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index fc1ec10ef..bc8f959af 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -55,7 +55,7 @@ functions = []; for line in mfile: - subline = re.search(".*(a16).*", line); + subline = re.search(".*_(a16|u)\.h.*", line); if subline: subsubline = re.search("(?<=volk_).*", subline.group(0)); if subsubline: @@ -70,7 +70,7 @@ datatypes = set(datatypes); for line in mfile: for dt in datatypes: if dt in line: - subline = re.search("(volk_" + dt +"_.*(a16).*\.h)", line); + subline = re.search("(volk_" + dt +"_.*(a16|u).*\.h)", line); if subline: subsubline = re.search(".+(?=\.h)", subline.group(0)); -- cgit From 82cafc4381e48ccc9423d2dc88720e5c1347d940 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 21 Jan 2011 12:26:52 -0800 Subject: Volk: fixed naming error. test coverage @ 75%, still need to add support for multiple outputs in the checker. some errors in the library were exposed by the new test suite, and a couple of bad Orc functions. need to investigate. --- volk/include/volk/Makefile.am | 2 +- .../volk/volk_16ic_deinterleave_real_8i_a16.h | 6 +- .../volk/volk_32fc_deinterleave_real_16i_a16.h | 80 ---------------------- .../volk_32fc_s32f_deinterleave_real_16i_a16.h | 80 ++++++++++++++++++++++ 4 files changed, 84 insertions(+), 84 deletions(-) delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 83f386c6c..79da0effb 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -68,7 +68,7 @@ volkinclude_HEADERS = \ volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \ volk_32fc_deinterleave_32f_x2_a16.h \ volk_32fc_deinterleave_64f_x2_a16.h \ - volk_32fc_deinterleave_real_16i_a16.h \ + volk_32fc_s32f_deinterleave_real_16i_a16.h \ volk_32fc_deinterleave_real_32f_a16.h \ volk_32fc_deinterleave_real_64f_a16.h \ volk_32fc_x2_dot_prod_32fc_a16.h \ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h index 437d5ab6b..55a25702e 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h @@ -53,7 +53,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con number = sixteenthPoints * 16; int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; for(; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); int16ComplexVectorPtr++; } } @@ -68,10 +68,10 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con */ static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* complexVectorPtr = (int16_t*)complexVector; int8_t* iBufferPtr = iBuffer; for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); + *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); complexVectorPtr++; } } diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h deleted file mode 100644 index 6042e6d62..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_real_16i_a16_H -#define INCLUDED_volk_32fc_deinterleave_real_16i_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - - iValue = _mm_mul_ps(iValue, vScalar); - - _mm_store_ps(floatBuffer, iValue); - *iBufferPtr++ = (int16_t)(floatBuffer[0]); - *iBufferPtr++ = (int16_t)(floatBuffer[1]); - *iBufferPtr++ = (int16_t)(floatBuffer[2]); - *iBufferPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - iBufferPtr = &iBuffer[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h new file mode 100644 index 000000000..31465bff9 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h @@ -0,0 +1,80 @@ +#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H +#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + iValue = _mm_mul_ps(iValue, vScalar); + + _mm_store_ps(floatBuffer, iValue); + *iBufferPtr++ = (int16_t)(floatBuffer[0]); + *iBufferPtr++ = (int16_t)(floatBuffer[1]); + *iBufferPtr++ = (int16_t)(floatBuffer[2]); + *iBufferPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + iBufferPtr = &iBuffer[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_s32f_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H */ -- cgit From 7a5a751073cc1583533b84c90ecc985b3669a696 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 21 Jan 2011 15:14:26 -0800 Subject: Volk: added QA support for multiple outputs, scalar outputs. 92% test coverage within the framework. --- volk/include/volk/Makefile.am | 2 +- .../volk/volk_32f_calc_spectral_noise_floor_a16.h | 167 --------------------- ...lk_32f_s32f_calc_spectral_noise_floor_32f_a16.h | 167 +++++++++++++++++++++ 3 files changed, 168 insertions(+), 168 deletions(-) delete mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 79da0effb..1eb46b602 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -63,7 +63,7 @@ volkinclude_HEADERS = \ volk_32f_x2_add_32f_a16.h \ volk_32fc_32f_multiply_32fc_a16.h \ volk_32fc_32f_power_32fc_a16.h \ - volk_32f_calc_spectral_noise_floor_a16.h \ + volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h \ volk_32fc_s32f_atan2_32f_a16.h \ volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \ volk_32fc_deinterleave_32f_x2_a16.h \ diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h deleted file mode 100644 index fce77cd04..000000000 --- a/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h +++ /dev/null @@ -1,167 +0,0 @@ -#ifndef INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H -#define INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the spectral noise floor of an input power spectrum - - Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. - - \param realDataPoints The input power spectrum - \param num_points The number of data points in the input power spectrum vector - \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 - \param noiseFloorAmplitude The noise floor of the input spectrum, in dB -*/ -static inline void volk_32f_calc_spectral_noise_floor_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* dataPointsPtr = realDataPoints; - float avgPointsVector[4] __attribute__((aligned(128))); - - __m128 dataPointsVal; - __m128 avgPointsVal = _mm_setzero_ps(); - // Calculate the sum (for mean) for all points - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); - } - - _mm_store_ps(avgPointsVector, avgPointsVal); - - float sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more - const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; - - dataPointsPtr = realDataPoints; // Reset the dataPointsPtr - __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); - __m128 vOnesVector = _mm_set_ps1(1.0); - __m128 vValidBinCount = _mm_setzero_ps(); - avgPointsVal = _mm_setzero_ps(); - __m128 compareMask; - number = 0; - // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - // Identify which items do not exceed the mean amplitude - compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); - - // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude - avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); - - // Count the number of bins which do not exceed the mean amplitude - vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); - } - - // Calculate the mean from the remaining data points - _mm_store_ps(avgPointsVector, avgPointsVal); - - sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - // Calculate the number of valid bins from the remaning count - float validBinCountVector[4] __attribute__((aligned(128))); - _mm_store_ps(validBinCountVector, vValidBinCount); - - float validBinCount = 0; - validBinCount += validBinCountVector[0]; - validBinCount += validBinCountVector[1]; - validBinCount += validBinCountVector[2]; - validBinCount += validBinCountVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(realDataPoints[number] <= meanAmplitude){ - sumMean += realDataPoints[number]; - validBinCount += 1.0; - } - } - - float localNoiseFloorAmplitude = 0; - if(validBinCount > 0.0){ - localNoiseFloorAmplitude = sumMean / validBinCount; - } - else{ - localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... - } - - *noiseFloorAmplitude = localNoiseFloorAmplitude; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the spectral noise floor of an input power spectrum - - Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. - - \param realDataPoints The input power spectrum - \param num_points The number of data points in the input power spectrum vector - \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 - \param noiseFloorAmplitude The noise floor of the input spectrum, in dB -*/ -static inline void volk_32f_calc_spectral_noise_floor_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ - float sumMean = 0.0; - unsigned int number; - // find the sum (for mean), etc - for(number = 0; number < num_points; number++){ - // sum (for mean) - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more) - const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; - - // now throw out any bins higher than the mean - sumMean = 0.0; - unsigned int newNumDataPoints = num_points; - for(number = 0; number < num_points; number++){ - if (realDataPoints[number] <= meanAmplitude) - sumMean += realDataPoints[number]; - else - newNumDataPoints--; - } - - float localNoiseFloorAmplitude = 0.0; - if (newNumDataPoints == 0) // in the odd case that all - localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! - else - localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); - - *noiseFloorAmplitude = localNoiseFloorAmplitude; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h new file mode 100644 index 000000000..168245d65 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h @@ -0,0 +1,167 @@ +#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H +#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* dataPointsPtr = realDataPoints; + float avgPointsVector[4] __attribute__((aligned(128))); + + __m128 dataPointsVal; + __m128 avgPointsVal = _mm_setzero_ps(); + // Calculate the sum (for mean) for all points + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); + } + + _mm_store_ps(avgPointsVector, avgPointsVal); + + float sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more + const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; + + dataPointsPtr = realDataPoints; // Reset the dataPointsPtr + __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); + __m128 vOnesVector = _mm_set_ps1(1.0); + __m128 vValidBinCount = _mm_setzero_ps(); + avgPointsVal = _mm_setzero_ps(); + __m128 compareMask; + number = 0; + // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + // Identify which items do not exceed the mean amplitude + compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); + + // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude + avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); + + // Count the number of bins which do not exceed the mean amplitude + vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); + } + + // Calculate the mean from the remaining data points + _mm_store_ps(avgPointsVector, avgPointsVal); + + sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + // Calculate the number of valid bins from the remaning count + float validBinCountVector[4] __attribute__((aligned(128))); + _mm_store_ps(validBinCountVector, vValidBinCount); + + float validBinCount = 0; + validBinCount += validBinCountVector[0]; + validBinCount += validBinCountVector[1]; + validBinCount += validBinCountVector[2]; + validBinCount += validBinCountVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(realDataPoints[number] <= meanAmplitude){ + sumMean += realDataPoints[number]; + validBinCount += 1.0; + } + } + + float localNoiseFloorAmplitude = 0; + if(validBinCount > 0.0){ + localNoiseFloorAmplitude = sumMean / validBinCount; + } + else{ + localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... + } + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + float sumMean = 0.0; + unsigned int number; + // find the sum (for mean), etc + for(number = 0; number < num_points; number++){ + // sum (for mean) + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more) + const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; + + // now throw out any bins higher than the mean + sumMean = 0.0; + unsigned int newNumDataPoints = num_points; + for(number = 0; number < num_points; number++){ + if (realDataPoints[number] <= meanAmplitude) + sumMean += realDataPoints[number]; + else + newNumDataPoints--; + } + + float localNoiseFloorAmplitude = 0.0; + if (newNumDataPoints == 0) // in the odd case that all + localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! + else + localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H */ -- cgit From fa8c8c8e9fcd74eda5edb58edc89be97bc4bfa0a Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 21 Jan 2011 15:29:08 -0800 Subject: Volk: added ability to spec scalar in test invocation --- volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h index 55a25702e..cfbebd57b 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h @@ -53,7 +53,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con number = sixteenthPoints * 16; int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; for(; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); int16ComplexVectorPtr++; } } @@ -71,7 +71,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, c int16_t* complexVectorPtr = (int16_t*)complexVector; int8_t* iBufferPtr = iBuffer; for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); + *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ / 256)); complexVectorPtr++; } } -- cgit From f832c9789be9fec46e211be4fb2355013d19c000 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 21 Jan 2011 18:24:02 -0800 Subject: Volk: Small changes to speed things up. --- volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_8i_a16.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h index cfbebd57b..55a25702e 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h @@ -53,7 +53,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con number = sixteenthPoints * 16; int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; for(; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); int16ComplexVectorPtr++; } } @@ -71,7 +71,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, c int16_t* complexVectorPtr = (int16_t*)complexVector; int8_t* iBufferPtr = iBuffer; for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ / 256)); + *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); complexVectorPtr++; } } diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h index c91448951..f64f2a213 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h @@ -106,7 +106,7 @@ static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, co unsigned int number = 0; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + *outputVectorPtr++ = (int8_t)(*inputVectorPtr++ * scalar); } } #endif /* LV_HAVE_GENERIC */ -- cgit From 81c3086bee1752c94a89ab2d20b7de048fdd1be7 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 14 Jan 2011 19:58:11 -0500 Subject: Cleans up the Makefiles for the various platforms. This should also make it easier to add new architectures. Thanks to Josh for the inspiration. --- volk/include/volk/Makefile.am | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 04a43bd34..658974d3a 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright 2010 Free Software Foundation, Inc. +# Copyright 2010,2011 Free Software Foundation, Inc. # # This file is part of GNU Radio # @@ -130,33 +130,11 @@ volkinclude_HEADERS = \ volk_8s_convert_32f_unaligned16.h VOLK_MKTABLES_SOURCES = \ + $(platform_CODE) \ $(top_srcdir)/lib/volk_rank_archs.c \ $(top_srcdir)/lib/volk_mktables.c -# FIXME: Not very extensible to supporting more processors easily -if MYSUBCPU_X86_64 - VOLK_MKTABLES_SOURCES += \ - $(top_srcdir)/lib/volk_cpu_x86.c \ - $(top_srcdir)/lib/cpuid_x86_64.S -endif - -if MYSUBCPU_X86 - VOLK_MKTABLES_SOURCES += \ - $(top_srcdir)/lib/volk_cpu_x86.c \ - $(top_srcdir)/lib/cpuid_x86.S -endif - -if MYSUBCPU_POWERPC - VOLK_MKTABLES_SOURCES += \ - $(top_srcdir)/lib/volk_cpu_powerpc.c -endif - -if MYSUBCPU_GENERIC - VOLK_MKTABLES_SOURCES += \ - $(top_srcdir)/lib/volk_cpu_generic.c -endif - volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) $(CC) -o $@ $^ $(AM_CPPFLAGS) -I$(top_builddir)/include -- cgit From 74e9f2b0bce026b7b68a2d0a3558687386fdaab5 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 14 Jan 2011 23:18:07 -0500 Subject: Creates volk_cpu in a .cc file and externs it in the header. --- volk/include/volk/make_cpuid_h.py | 25 ++++++++++++++++++++++++- volk/include/volk/make_proccpu_sim.py | 23 +++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_cpuid_h.py b/volk/include/volk/make_cpuid_h.py index 823e3b2c0..cd3da2455 100644 --- a/volk/include/volk/make_cpuid_h.py +++ b/volk/include/volk/make_cpuid_h.py @@ -1,3 +1,25 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + from xml.dom import minidom from emit_omnilog import * @@ -13,7 +35,8 @@ def make_cpuid_h(dom) : for domarch in dom: arch = str(domarch.attributes["name"].value); tempstring = tempstring + " int (*has_" + arch + ") ();\n"; - tempstring = tempstring + "}volk_cpu;\n\n"; + tempstring = tempstring + "};\n\n"; + tempstring = tempstring + "extern struct VOLK_CPU volk_cpu;\n\n"; tempstring = tempstring + "void volk_cpu_init ();\n" tempstring = tempstring + "unsigned int volk_get_lvarch ();\n" diff --git a/volk/include/volk/make_proccpu_sim.py b/volk/include/volk/make_proccpu_sim.py index c75a4d5fb..e8463f5be 100644 --- a/volk/include/volk/make_proccpu_sim.py +++ b/volk/include/volk/make_proccpu_sim.py @@ -1,3 +1,25 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + from xml.dom import minidom def make_proccpu_sim(dom) : @@ -6,6 +28,7 @@ def make_proccpu_sim(dom) : tempstring = tempstring + "#include \n" tempstring = tempstring + "#include \n" tempstring = tempstring + "\n\n" + tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" tempstring = tempstring + "void test_append(char* buf, int val, char* newkey){\n"; tempstring = tempstring + " if(val==1){\n"; -- cgit From 568d0cafb8423f34530c94963654abea4c62801b Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 14 Jan 2011 23:18:07 -0500 Subject: A fix for volk_cpu being a duplicate variable. Declared as extern in the header file volk_cpu.h and actually created in the C files. Note that this could be a problem if multiple architectures are ever (can ever?) be built at the same time. If that happens, we can move this variable declaration to another C file that is made common to all builds. --- volk/include/volk/make_cpuid_generic_c.py | 26 ++++++++++++++++++++++++-- volk/include/volk/make_cpuid_h.py | 25 ++++++++++++++++++++++++- volk/include/volk/make_cpuid_powerpc_c.py | 26 ++++++++++++++++++++++++-- volk/include/volk/make_cpuid_x86_c.py | 27 +++++++++++++++++++++++++-- 4 files changed, 97 insertions(+), 7 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_cpuid_generic_c.py b/volk/include/volk/make_cpuid_generic_c.py index 3ba225fca..c682d4138 100644 --- a/volk/include/volk/make_cpuid_generic_c.py +++ b/volk/include/volk/make_cpuid_generic_c.py @@ -1,11 +1,33 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + from xml.dom import minidom def make_cpuid_generic_c(dom) : tempstring = ""; tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n" - tempstring = tempstring + "\n\n" + tempstring = tempstring + "#include \n\n" + tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" for domarch in dom: if str(domarch.attributes["type"].value) == "all": diff --git a/volk/include/volk/make_cpuid_h.py b/volk/include/volk/make_cpuid_h.py index 823e3b2c0..cd3da2455 100644 --- a/volk/include/volk/make_cpuid_h.py +++ b/volk/include/volk/make_cpuid_h.py @@ -1,3 +1,25 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + from xml.dom import minidom from emit_omnilog import * @@ -13,7 +35,8 @@ def make_cpuid_h(dom) : for domarch in dom: arch = str(domarch.attributes["name"].value); tempstring = tempstring + " int (*has_" + arch + ") ();\n"; - tempstring = tempstring + "}volk_cpu;\n\n"; + tempstring = tempstring + "};\n\n"; + tempstring = tempstring + "extern struct VOLK_CPU volk_cpu;\n\n"; tempstring = tempstring + "void volk_cpu_init ();\n" tempstring = tempstring + "unsigned int volk_get_lvarch ();\n" diff --git a/volk/include/volk/make_cpuid_powerpc_c.py b/volk/include/volk/make_cpuid_powerpc_c.py index 443a58488..0b0ea84e7 100644 --- a/volk/include/volk/make_cpuid_powerpc_c.py +++ b/volk/include/volk/make_cpuid_powerpc_c.py @@ -1,11 +1,33 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + from xml.dom import minidom def make_cpuid_powerpc_c(dom) : tempstring = ""; tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n" - tempstring = tempstring + "\n\n" + tempstring = tempstring + "#include \n\n" + tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" #just assume it has them for powerpc for domarch in dom: diff --git a/volk/include/volk/make_cpuid_x86_c.py b/volk/include/volk/make_cpuid_x86_c.py index 8ebe243e5..48a406fa4 100644 --- a/volk/include/volk/make_cpuid_x86_c.py +++ b/volk/include/volk/make_cpuid_x86_c.py @@ -1,11 +1,34 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + from xml.dom import minidom def make_cpuid_x86_c(dom) : tempstring = ""; tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n" - tempstring = tempstring + "\n\n" + tempstring = tempstring + "#include \n\n" + tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" + tempstring = tempstring + "extern void cpuid_x86 (unsigned int op, unsigned int result[4]);\n\n" tempstring = tempstring + "static inline unsigned int cpuid_eax(unsigned int op) {\n"; tempstring = tempstring + " unsigned int regs[4];\n" -- cgit From 2e8610450bc467400f8de2976cc6c1743025aff1 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 15 Jan 2011 10:53:54 -0500 Subject: This should not have been put here. --- volk/include/volk/make_proccpu_sim.py | 1 - 1 file changed, 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_proccpu_sim.py b/volk/include/volk/make_proccpu_sim.py index e8463f5be..029dacfcc 100644 --- a/volk/include/volk/make_proccpu_sim.py +++ b/volk/include/volk/make_proccpu_sim.py @@ -28,7 +28,6 @@ def make_proccpu_sim(dom) : tempstring = tempstring + "#include \n" tempstring = tempstring + "#include \n" tempstring = tempstring + "\n\n" - tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" tempstring = tempstring + "void test_append(char* buf, int val, char* newkey){\n"; tempstring = tempstring + " if(val==1){\n"; -- cgit From 9edf280fb25bee071c166123ac1aee41d4a4949e Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Sat, 15 Jan 2011 17:29:15 -0800 Subject: volk: replace assembly and separate cases with gcc cpuid for all x86 --- volk/include/volk/make_cpuid_x86_c.py | 3 ++- volk/include/volk/make_set_simd.py | 33 +++++++++------------------------ 2 files changed, 11 insertions(+), 25 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_cpuid_x86_c.py b/volk/include/volk/make_cpuid_x86_c.py index 48a406fa4..2b2bd7c91 100644 --- a/volk/include/volk/make_cpuid_x86_c.py +++ b/volk/include/volk/make_cpuid_x86_c.py @@ -27,9 +27,10 @@ def make_cpuid_x86_c(dom) : tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring + "#include \n" tempstring = tempstring + "#include \n\n" + tempstring = tempstring + "#include \n\n" tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" - tempstring = tempstring + "extern void cpuid_x86 (unsigned int op, unsigned int result[4]);\n\n" + tempstring = tempstring + "#define cpuid_x86(op, r) __get_cpuid(op, r+0, r+1, r+2, r+3)\n\n" tempstring = tempstring + "static inline unsigned int cpuid_eax(unsigned int op) {\n"; tempstring = tempstring + " unsigned int regs[4];\n" tempstring = tempstring + " cpuid_x86 (op, regs);\n" diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 842366b18..275d3869f 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -27,30 +27,15 @@ def make_set_simd(dom) : tempstring = tempstring + " AC_MSG_CHECKING([proccpu])\n"; tempstring = tempstring + " case \"$MD_CPU\" in\n"; tempstring = tempstring + " (x86)\n"; - tempstring = tempstring + " case \"$MD_SUBCPU\" in\n"; - tempstring = tempstring + " (x86)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I $srcdir/include/ -I$srcdir/lib $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_x86.c $srcdir/lib/cpuid_x86.S`\"\n"; - tempstring = tempstring + " then\n"; - tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; - tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; - tempstring = tempstring + " rm -f proccpu\n"; - tempstring = tempstring + " else\n"; - tempstring = tempstring + " AC_MSG_RESULT(no)\n"; - tempstring = tempstring + " lv_PROCCPU=no\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " ;;\n" - tempstring = tempstring + " (*)\n" - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ -I$srcdir/lib $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_x86.c $srcdir/lib/cpuid_x86_64.S`\"\n"; - tempstring = tempstring + " then\n"; - tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; - tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; - tempstring = tempstring + " rm -f proccpu\n"; - tempstring = tempstring + " else\n"; - tempstring = tempstring + " AC_MSG_RESULT(no)\n"; - tempstring = tempstring + " lv_PROCCPU=no\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " ;;\n" - tempstring = tempstring + " esac\n" + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ -I$srcdir/lib $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_x86.c 2>&1`\"\n"; + tempstring = tempstring + " then\n"; + tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; + tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; + tempstring = tempstring + " rm -f proccpu\n"; + tempstring = tempstring + " else\n"; + tempstring = tempstring + " AC_MSG_RESULT(no)\n"; + tempstring = tempstring + " lv_PROCCPU=no\n"; + tempstring = tempstring + " fi\n" tempstring = tempstring + " ;;\n"; tempstring = tempstring + " (powerpc)\n"; tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_powerpc.c 2>&1`\"\n"; -- cgit From 108a594c0838ad21f93cba6597d1f66af097b157 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Tue, 25 Jan 2011 10:37:49 -0500 Subject: volk: New volk kernel for conjugate dot products with unaligned buffers. Note: need to convert this to new naming standard. --- volk/include/volk/Makefile.am | 3 +- .../volk/volk_32fc_conjugate_dot_prod_unaligned.h | 144 +++++++++++++++++++++ 2 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 658974d3a..00289be1e 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -66,6 +66,7 @@ volkinclude_HEADERS = \ volk_32f_calc_spectral_noise_floor_aligned16.h \ volk_32fc_atan2_32f_aligned16.h \ volk_32fc_conjugate_dot_prod_aligned16.h \ + volk_32fc_conjugate_dot_prod_unaligned.h \ volk_32fc_deinterleave_32f_aligned16.h \ volk_32fc_deinterleave_64f_aligned16.h \ volk_32fc_deinterleave_real_16s_aligned16.h \ @@ -157,4 +158,4 @@ distclean-local: rm -f Makefile.in rm -f volk_environment_init.h rm -f volk_mktables - rm -f $(BUILT_SOURCES) \ No newline at end of file + rm -f $(BUILT_SOURCES) diff --git a/volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h b/volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h new file mode 100644 index 000000000..ead1573fe --- /dev/null +++ b/volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_UNALIGNED_H +#define INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_UNALIGNED_H + + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_conjugate_dot_prod_unaligned_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE3 + +#include +#include +#include + + +static inline void volk_32fc_conjugate_dot_prod_unaligned_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + union HalfMask { + uint32_t intRep[4]; + __m128 vec; + } halfMask; + + union NegMask { + int intRep[4]; + __m128 vec; + } negMask; + + unsigned int offset = 0; + float Rsum=0, Isum=0; + float Im,Re; + + __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; + __m128 zv = {0,0,0,0}; + + halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; + halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; + + negMask.intRep[0] = negMask.intRep[2] = 0x80000000; + negMask.intRep[1] = negMask.intRep[3] = 0; + + // main loop + while(num_bytes >= 4*sizeof(float)){ + + in1 = _mm_loadu_ps( (float*) (input+offset) ); + in2 = _mm_loadu_ps( (float*) (taps+offset) ); + Rv = in1*in2; + fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); + Iv = in1*fehg; + Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); + Ivm = _mm_xor_ps( negMask.vec, Iv ); + Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); + _mm_store_ss( &Im, Is ); + _mm_store_ss( &Re, Rs ); + num_bytes -= 4*sizeof(float); + offset += 2; + Rsum += Re; + Isum += Im; + } + + // handle the last complex case ... + if(num_bytes > 0){ + + if(num_bytes != 4){ + // bad things are happening + } + + in1 = _mm_loadu_ps( (float*) (input+offset) ); + in2 = _mm_loadu_ps( (float*) (taps+offset) ); + Rv = _mm_and_ps(in1*in2, halfMask.vec); + fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); + Iv = _mm_and_ps(in1*fehg, halfMask.vec); + Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); + Ivm = _mm_xor_ps( negMask.vec, Iv ); + Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); + _mm_store_ss( &Im, Is ); + _mm_store_ss( &Re, Rs ); + Rsum += Re; + Isum += Im; + } + + result[0] = lv_32fc_init(Rsum,Isum); + return; +} + +#endif /*LV_HAVE_SSE3*/ + + +#endif /*INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_UNALIGNED_H*/ + + + -- cgit From f47466012c733526e04f8ceeb66a3677eec38cd9 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 25 Jan 2011 18:20:36 -0800 Subject: Volk: Orc impl for complex multiply fixed. Maybe some room for optimization. --- volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h index 224ab19c8..b4214f5d2 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h @@ -81,10 +81,9 @@ static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, co \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); +extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - static const float mask = -0.0; - volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points); + volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -- cgit From 2a4c4f89187bf75caa34c7bc52fc32310a75c9f2 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 26 Jan 2011 15:28:35 -0800 Subject: Volk: fixed volk_8i_s32f_convert_32f_a16_orc_impl. --- volk/include/volk/volk_8i_s32f_convert_32f_a16.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h index d5c8eeb51..99a24ec10 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h +++ b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h @@ -96,7 +96,8 @@ static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, con */ extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points); + float invscalar = 1.0 / scalar; + volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, invscalar, num_points); } #endif /* LV_HAVE_ORC */ -- cgit From e34a484084a5224ec3412bd7d6c6f285301f5d43 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 26 Jan 2011 15:47:56 -0800 Subject: Volk: renamed volk_32fc_32f_power_32fc_a16 to volk_32fc_s32f_power_32fc_a16 --- volk/include/volk/Makefile.am | 2 +- volk/include/volk/volk_32fc_32f_power_32fc_a16.h | 109 ---------------------- volk/include/volk/volk_32fc_s32f_power_32fc_a16.h | 109 ++++++++++++++++++++++ 3 files changed, 110 insertions(+), 110 deletions(-) delete mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_power_32fc_a16.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 1eb46b602..eb97775b0 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -62,7 +62,7 @@ volkinclude_HEADERS = \ volk_32f_accumulator_s32f_a16.h \ volk_32f_x2_add_32f_a16.h \ volk_32fc_32f_multiply_32fc_a16.h \ - volk_32fc_32f_power_32fc_a16.h \ + volk_32fc_s32f_power_32fc_a16.h \ volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h \ volk_32fc_s32f_atan2_32f_a16.h \ volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \ diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_32f_power_32fc_a16.h deleted file mode 100644 index 6f9e9e3ee..000000000 --- a/volk/include/volk/volk_32fc_32f_power_32fc_a16.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32f_power_32fc_a16_H -#define INCLUDED_volk_32fc_32f_power_32fc_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Takes each the input complex vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The complex vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32fc_32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - -#if LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - - __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps((float*)aPtr); - aPtr += 2; - - cplxValue2 = _mm_load_ps((float*)aPtr); - aPtr += 2; - - // Convert to polar coordinates - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - phase = atan2f4(qValue, iValue); // Calculate the Phase - - magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values - - // Now calculate the power of the polar coordinate data - magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power - - phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power - - // Convert back to cartesian coordinates - iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude - qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude - - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values - - _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container - - cPtr += 2; - - _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container - - cPtr += 2; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - lv_32fc_t complexPower; - ((float*)&complexPower)[0] = power; - ((float*)&complexPower)[1] = 0; - for(;number < num_points; number++){ - *cPtr++ = lv_cpow((*aPtr++), complexPower); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Takes each the input complex vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The complex vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector - */ -static inline void volk_32fc_32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; - lv_32fc_t complexPower; - ((float*)&complexPower)[0] = power; - ((float*)&complexPower)[1] = 0.0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = lv_cpow((*aPtr++), complexPower); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_32f_power_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h new file mode 100644 index 000000000..3507fdb3c --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a16_H +#define INCLUDED_volk_32fc_s32f_power_32fc_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + + __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + cplxValue2 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + // Convert to polar coordinates + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + phase = atan2f4(qValue, iValue); // Calculate the Phase + + magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values + + // Now calculate the power of the polar coordinate data + magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power + + phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power + + // Convert back to cartesian coordinates + iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude + qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude + + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values + + _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container + + cPtr += 2; + + _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + lv_32fc_t complexPower; + ((float*)&complexPower)[0] = power; + ((float*)&complexPower)[1] = 0; + for(;number < num_points; number++){ + *cPtr++ = lv_cpow((*aPtr++), complexPower); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + lv_32fc_t complexPower; + ((float*)&complexPower)[0] = power; + ((float*)&complexPower)[1] = 0.0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_cpow((*aPtr++), complexPower); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a16_H */ -- cgit From 6503e3b21978b71908400c994148836bec4a97b9 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sun, 30 Jan 2011 12:35:07 -0500 Subject: volk: Updating build structure to work when orc is not installed. Distcheck passes for me if liborc is installed or not. --- volk/include/volk/make_set_simd.py | 1 - 1 file changed, 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index f2b7c0656..019833b43 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -280,7 +280,6 @@ def make_set_simd(dom) : tempstring = tempstring + " ;;\n" tempstring = tempstring + " esac\n" tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" - tempstring = tempstring + " AM_CONDITIONAL(LV_HAVE_ORC, [test \"$LV_HAVE_ORC\" = \"yes\"])\n"; tempstring = tempstring + "])\n" return tempstring; -- cgit From b806f6e95cd917e54884841c8e7928204ecd78f8 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 2 Feb 2011 14:21:46 -0500 Subject: volk: updating to readd unaligned dot product under new name scheme. --- volk/include/volk/Makefile.am | 1 + .../volk/volk_32fc_conjugate_dot_prod_unaligned.h | 144 --------------------- .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 144 +++++++++++++++++++++ 3 files changed, 145 insertions(+), 144 deletions(-) delete mode 100644 volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h create mode 100644 volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index eb97775b0..7a5edd624 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -66,6 +66,7 @@ volkinclude_HEADERS = \ volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h \ volk_32fc_s32f_atan2_32f_a16.h \ volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \ + volk_32fc_x2_conjugate_dot_prod_32fc_u.h \ volk_32fc_deinterleave_32f_x2_a16.h \ volk_32fc_deinterleave_64f_x2_a16.h \ volk_32fc_s32f_deinterleave_real_16i_a16.h \ diff --git a/volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h b/volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h deleted file mode 100644 index ead1573fe..000000000 --- a/volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_UNALIGNED_H -#define INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_UNALIGNED_H - - -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32fc_conjugate_dot_prod_unaligned_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; - sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] + in[3] * tp[3]; - sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - - } - /* - for(i = 0; i < num_bytes >> 3; ++i) { - *result += input[i] * conjf(taps[i]); - } - */ -} - -#endif /*LV_HAVE_GENERIC*/ - -#if LV_HAVE_SSE3 - -#include -#include -#include - - -static inline void volk_32fc_conjugate_dot_prod_unaligned_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - union HalfMask { - uint32_t intRep[4]; - __m128 vec; - } halfMask; - - union NegMask { - int intRep[4]; - __m128 vec; - } negMask; - - unsigned int offset = 0; - float Rsum=0, Isum=0; - float Im,Re; - - __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; - __m128 zv = {0,0,0,0}; - - halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; - halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; - - negMask.intRep[0] = negMask.intRep[2] = 0x80000000; - negMask.intRep[1] = negMask.intRep[3] = 0; - - // main loop - while(num_bytes >= 4*sizeof(float)){ - - in1 = _mm_loadu_ps( (float*) (input+offset) ); - in2 = _mm_loadu_ps( (float*) (taps+offset) ); - Rv = in1*in2; - fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); - Iv = in1*fehg; - Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); - Ivm = _mm_xor_ps( negMask.vec, Iv ); - Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); - _mm_store_ss( &Im, Is ); - _mm_store_ss( &Re, Rs ); - num_bytes -= 4*sizeof(float); - offset += 2; - Rsum += Re; - Isum += Im; - } - - // handle the last complex case ... - if(num_bytes > 0){ - - if(num_bytes != 4){ - // bad things are happening - } - - in1 = _mm_loadu_ps( (float*) (input+offset) ); - in2 = _mm_loadu_ps( (float*) (taps+offset) ); - Rv = _mm_and_ps(in1*in2, halfMask.vec); - fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); - Iv = _mm_and_ps(in1*fehg, halfMask.vec); - Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); - Ivm = _mm_xor_ps( negMask.vec, Iv ); - Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); - _mm_store_ss( &Im, Is ); - _mm_store_ss( &Re, Rs ); - Rsum += Re; - Isum += Im; - } - - result[0] = lv_32fc_init(Rsum,Isum); - return; -} - -#endif /*LV_HAVE_SSE3*/ - - -#endif /*INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_UNALIGNED_H*/ - - - diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h new file mode 100644 index 000000000..2fa5918cc --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H +#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H + + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE3 + +#include +#include +#include + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + union HalfMask { + uint32_t intRep[4]; + __m128 vec; + } halfMask; + + union NegMask { + int intRep[4]; + __m128 vec; + } negMask; + + unsigned int offset = 0; + float Rsum=0, Isum=0; + float Im,Re; + + __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; + __m128 zv = {0,0,0,0}; + + halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; + halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; + + negMask.intRep[0] = negMask.intRep[2] = 0x80000000; + negMask.intRep[1] = negMask.intRep[3] = 0; + + // main loop + while(num_bytes >= 4*sizeof(float)){ + + in1 = _mm_loadu_ps( (float*) (input+offset) ); + in2 = _mm_loadu_ps( (float*) (taps+offset) ); + Rv = in1*in2; + fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); + Iv = in1*fehg; + Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); + Ivm = _mm_xor_ps( negMask.vec, Iv ); + Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); + _mm_store_ss( &Im, Is ); + _mm_store_ss( &Re, Rs ); + num_bytes -= 4*sizeof(float); + offset += 2; + Rsum += Re; + Isum += Im; + } + + // handle the last complex case ... + if(num_bytes > 0){ + + if(num_bytes != 4){ + // bad things are happening + } + + in1 = _mm_loadu_ps( (float*) (input+offset) ); + in2 = _mm_loadu_ps( (float*) (taps+offset) ); + Rv = _mm_and_ps(in1*in2, halfMask.vec); + fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); + Iv = _mm_and_ps(in1*fehg, halfMask.vec); + Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); + Ivm = _mm_xor_ps( negMask.vec, Iv ); + Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); + _mm_store_ss( &Im, Is ); + _mm_store_ss( &Re, Rs ); + Rsum += Re; + Isum += Im; + } + + result[0] = lv_32fc_init(Rsum,Isum); + return; +} + +#endif /*LV_HAVE_SSE3*/ + + +#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/ + + + -- cgit From 1633e9371a0bce876757f1c2c3e4054436b57950 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 18:48:56 -0800 Subject: cpuid: No more compile-time CPU checks. Compiles everything that gcc allows. Configure- and compile-time checks modified to only compile architectures that the compiler will handle. This means that volk will compile every arch that your gcc will compile, no matter if your CPU can execute the instructions or not. This lets you cross-compile volk. volk_rank_archs will be deferred to runtime. This has not been done yet. --- volk/include/volk/make_cpuid_c.py | 184 ++++++++++++++++++++++ volk/include/volk/make_cpuid_generic_c.py | 60 -------- volk/include/volk/make_cpuid_powerpc_c.py | 67 -------- volk/include/volk/make_cpuid_x86_c.py | 133 ---------------- volk/include/volk/make_mktables.py | 2 +- volk/include/volk/make_set_simd.py | 243 +++++++++--------------------- volk/include/volk/volk_register.py | 23 +-- 7 files changed, 257 insertions(+), 455 deletions(-) create mode 100644 volk/include/volk/make_cpuid_c.py delete mode 100644 volk/include/volk/make_cpuid_generic_c.py delete mode 100644 volk/include/volk/make_cpuid_powerpc_c.py delete mode 100644 volk/include/volk/make_cpuid_x86_c.py (limited to 'volk/include') diff --git a/volk/include/volk/make_cpuid_c.py b/volk/include/volk/make_cpuid_c.py new file mode 100644 index 000000000..20621769b --- /dev/null +++ b/volk/include/volk/make_cpuid_c.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + +from xml.dom import minidom + +HEADER_TEMPL = """\ +/*this file is auto_generated by volk_register.py*/ + +#include +#include + +struct VOLK_CPU volk_cpu; + +#if defined(__i386__) || (__x86_64__) +#include +#define cpuid_x86(op, r) __get_cpuid(op, r+0, r+1, r+2, r+3) + +static inline unsigned int cpuid_eax(unsigned int op) { + unsigned int regs[4]; + cpuid_x86 (op, regs); + return regs[0]; +} + +static inline unsigned int cpuid_ebx(unsigned int op) { + unsigned int regs[4]; + cpuid_x86 (op, regs); + return regs[1]; +} + +static inline unsigned int cpuid_ecx(unsigned int op) { + unsigned int regs[4]; + cpuid_x86 (op, regs); + return regs[2]; +} + +static inline unsigned int cpuid_edx(unsigned int op) { + unsigned int regs[4]; + cpuid_x86 (op, regs); + return regs[3]; +} +#endif + +""" + +def make_cpuid_c(dom) : + tempstring = HEADER_TEMPL; + + for domarch in dom: + if str(domarch.attributes["type"].value) == "x86": + if "no_test" in domarch.attributes.keys(): + no_test = str(domarch.attributes["no_test"].value); + if no_test == "true": + no_test = True; + else: + no_test = False; + else: + no_test = False; + arch = str(domarch.attributes["name"].value); + op = domarch.getElementsByTagName("op"); + if op: + op = str(op[0].firstChild.data); + reg = domarch.getElementsByTagName("reg"); + if reg: + reg = str(reg[0].firstChild.data); + shift = domarch.getElementsByTagName("shift"); + if shift: + shift = str(shift[0].firstChild.data); + val = domarch.getElementsByTagName("val"); + if val: + val = str(val[0].firstChild.data); + + if no_test: + tempstring = tempstring + """\ +int i_can_has_%s () { +#if defined(__i386__) || (__x86_64__) + return 1; +#else + return 0; +#endif +} + +""" % (arch) + + elif op == "1": + tempstring = tempstring + """\ +int i_can_has_%s () { +#if defined(__i386__) || (__x86_64__) + unsigned int e%sx = cpuid_e%sx (%s); + return ((e%sx >> %s) & 1) == %s; +#else + return 0; +#endif +} + +""" % (arch, reg, reg, op, reg, shift, val) + + elif op == "0x80000001": + tempstring = tempstring + """\ +int i_can_has_%s () { +#if defined(__i386__) || (__x86_64__) + unsigned int extended_fct_count = cpuid_eax(0x80000000); + if (extended_fct_count < 0x80000001) + return %s^1; + unsigned int extended_features = cpuid_e%sx (%s); + return ((extended_features >> %s) & 1) == %s; +#else + return 0; +#endif +} + +""" % (arch, val, reg, op, shift, val) + + elif str(domarch.attributes["type"].value) == "powerpc": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + """\ +int i_can_has_%s () { +#ifdef __PPC__ + return 1; +#else + return 0; +#endif +} + +""" % (arch) + + elif str(domarch.attributes["type"].value) == "all": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + """\ +int i_can_has_%s () { + return 1; +} + +""" % (arch) + else: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + """\ +int i_can_has_%s () { + return 0; +} + +""" % (arch) + + tempstring = tempstring + "void volk_cpu_init() {\n"; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" + tempstring = tempstring + "}\n\n" + + tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; + tempstring = tempstring + " unsigned int retval = 0;\n" + tempstring = tempstring + " volk_cpu_init();\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" + tempstring = tempstring + " return retval;\n" + tempstring = tempstring + "}\n\n" + + return tempstring; + + + + + + + diff --git a/volk/include/volk/make_cpuid_generic_c.py b/volk/include/volk/make_cpuid_generic_c.py deleted file mode 100644 index c682d4138..000000000 --- a/volk/include/volk/make_cpuid_generic_c.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2011 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Radio; see the file COPYING. If not, write to -# the Free Software Foundation, Inc., 51 Franklin Street, -# Boston, MA 02110-1301, USA. -# - -from xml.dom import minidom - -def make_cpuid_generic_c(dom) : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n\n" - tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" - - for domarch in dom: - if str(domarch.attributes["type"].value) == "all": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 1;\n" - tempstring = tempstring + "}\n\n" - - else: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 0;\n" - tempstring = tempstring + "}\n\n" - - tempstring = tempstring + "void volk_cpu_init() {\n"; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" - tempstring = tempstring + "}\n\n" - - tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; - tempstring = tempstring + " unsigned int retval = 0;\n" - tempstring = tempstring + " volk_cpu_init();\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" - tempstring = tempstring + " return retval;\n" - tempstring = tempstring + "}\n\n" - - return tempstring; diff --git a/volk/include/volk/make_cpuid_powerpc_c.py b/volk/include/volk/make_cpuid_powerpc_c.py deleted file mode 100644 index 0b0ea84e7..000000000 --- a/volk/include/volk/make_cpuid_powerpc_c.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2011 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Radio; see the file COPYING. If not, write to -# the Free Software Foundation, Inc., 51 Franklin Street, -# Boston, MA 02110-1301, USA. -# - -from xml.dom import minidom - -def make_cpuid_powerpc_c(dom) : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n\n" - tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" - - #just assume it has them for powerpc - for domarch in dom: - if str(domarch.attributes["type"].value) == "powerpc": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 1;\n" - tempstring = tempstring + "}\n\n" - elif str(domarch.attributes["type"].value) == "all": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 1;\n" - tempstring = tempstring + "}\n\n" - else: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 0;\n" - tempstring = tempstring + "}\n\n" - - - tempstring = tempstring + "void volk_cpu_init() {\n"; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" - - tempstring = tempstring + "}\n\n" - tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; - tempstring = tempstring + " unsigned int retval = 0;\n" - tempstring = tempstring + " volk_cpu_init();\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" - tempstring = tempstring + " return retval;\n" - tempstring = tempstring + "}\n\n" - - return tempstring; - diff --git a/volk/include/volk/make_cpuid_x86_c.py b/volk/include/volk/make_cpuid_x86_c.py deleted file mode 100644 index 2b2bd7c91..000000000 --- a/volk/include/volk/make_cpuid_x86_c.py +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2011 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Radio; see the file COPYING. If not, write to -# the Free Software Foundation, Inc., 51 Franklin Street, -# Boston, MA 02110-1301, USA. -# - -from xml.dom import minidom - -def make_cpuid_x86_c(dom) : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n\n" - tempstring = tempstring + "#include \n\n" - tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" - - tempstring = tempstring + "#define cpuid_x86(op, r) __get_cpuid(op, r+0, r+1, r+2, r+3)\n\n" - tempstring = tempstring + "static inline unsigned int cpuid_eax(unsigned int op) {\n"; - tempstring = tempstring + " unsigned int regs[4];\n" - tempstring = tempstring + " cpuid_x86 (op, regs);\n" - tempstring = tempstring + " return regs[0];\n" - tempstring = tempstring + "}\n\n"; - - tempstring = tempstring + "static inline unsigned int cpuid_ebx(unsigned int op) {\n"; - tempstring = tempstring + " unsigned int regs[4];\n" - tempstring = tempstring + " cpuid_x86 (op, regs);\n" - tempstring = tempstring + " return regs[1];\n" - tempstring = tempstring + "}\n\n"; - - tempstring = tempstring + "static inline unsigned int cpuid_ecx(unsigned int op) {\n"; - tempstring = tempstring + " unsigned int regs[4];\n" - tempstring = tempstring + " cpuid_x86 (op, regs);\n" - tempstring = tempstring + " return regs[2];\n" - tempstring = tempstring + "}\n\n"; - - tempstring = tempstring + "static inline unsigned int cpuid_edx(unsigned int op) {\n"; - tempstring = tempstring + " unsigned int regs[4];\n" - tempstring = tempstring + " cpuid_x86 (op, regs);\n" - tempstring = tempstring + " return regs[3];\n" - tempstring = tempstring + "}\n\n"; - - for domarch in dom: - if str(domarch.attributes["type"].value) == "x86": - if "no_test" in domarch.attributes.keys(): - no_test = str(domarch.attributes["no_test"].value); - if no_test == "true": - no_test = True; - else: - no_test = False; - else: - no_test = False; - arch = str(domarch.attributes["name"].value); - op = domarch.getElementsByTagName("op"); - if op: - op = str(op[0].firstChild.data); - reg = domarch.getElementsByTagName("reg"); - if reg: - reg = str(reg[0].firstChild.data); - shift = domarch.getElementsByTagName("shift"); - if shift: - shift = str(shift[0].firstChild.data); - val = domarch.getElementsByTagName("val"); - if val: - val = str(val[0].firstChild.data); - - if no_test: - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 1;\n" - tempstring = tempstring + "}\n\n" - elif op == "1": - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " unsigned int e" + reg + "x = cpuid_e" + reg + "x (" + op + ");\n" - tempstring = tempstring + " return ((e" + reg + "x >> " + shift + ") & 1) == " + val + ";\n" - tempstring = tempstring + "}\n\n"; - - elif op == "0x80000001": - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " unsigned int extended_fct_count = cpuid_eax(0x80000000);\n"; - tempstring = tempstring + " if (extended_fct_count < 0x80000001)\n"; - tempstring = tempstring + " return "+ val + "^1;\n\n" - tempstring = tempstring + " unsigned int extended_features = cpuid_e" + reg + "x (" + op + ");\n"; - tempstring = tempstring + " return ((extended_features >> " + shift + ") & 1) == " + val + ";\n" - tempstring = tempstring + "}\n\n"; - elif str(domarch.attributes["type"].value) == "all": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 1;\n" - tempstring = tempstring + "}\n\n" - else: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 0;\n" - tempstring = tempstring + "}\n\n" - - tempstring = tempstring + "void volk_cpu_init() {\n"; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" - tempstring = tempstring + "}\n\n" - - tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; - tempstring = tempstring + " unsigned int retval = 0;\n" - tempstring = tempstring + " volk_cpu_init();\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" - tempstring = tempstring + " return retval;\n" - tempstring = tempstring + "}\n\n" - - return tempstring; - - - - - - - diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py index 051ac268d..25da086fa 100644 --- a/volk/include/volk/make_mktables.py +++ b/volk/include/volk/make_mktables.py @@ -18,7 +18,7 @@ def make_mktables(funclist) : tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n'; for func in funclist: - tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", volk_rank_archs(' + func + '_arch_defs, volk_get_lvarch()));\n'; + tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", 0);\n'; tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n'; tempstring = tempstring + ' fclose(output);\n' tempstring = tempstring + '}\n'; diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index c74b0464d..8aef5bb16 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -15,53 +15,23 @@ # along with this program. If not, see . # +#TODO: add in checking for 32-bit and 64-bit compiler support, to pick +#which version to use. + +#for a given compiler: first try 64 bit, then fall back to 32 bit. +#to try 64 bit, just try compiling a program with -m64 +#if it passes, whee you're 64-bit +#if not, 32 +#this should probably be elsewhere and just referred in here +#actually, why the fuck does LF_CHECK_CC_FLAG([-m64]) work at all? it doesn't even COMPILE + from xml.dom import minidom def make_set_simd(dom) : tempstring = ""; tempstring = tempstring +'dnl this file is auto generated by volk_register.py\n\n'; - tempstring = tempstring + "AC_DEFUN([_MAKE_FAKE_PROCCPU],\n"; - tempstring = tempstring + "[\n"; - tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n"; - tempstring = tempstring + " AC_MSG_CHECKING([proccpu])\n"; - tempstring = tempstring + " case \"$MD_CPU\" in\n"; - tempstring = tempstring + " (x86)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ -I$srcdir/lib $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_x86.c 2>&1`\"\n"; - tempstring = tempstring + " then\n"; - tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; - tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; - tempstring = tempstring + " rm -f proccpu\n"; - tempstring = tempstring + " else\n"; - tempstring = tempstring + " AC_MSG_RESULT(no)\n"; - tempstring = tempstring + " lv_PROCCPU=no\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " ;;\n"; - tempstring = tempstring + " (powerpc)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_powerpc.c 2>&1`\"\n"; - tempstring = tempstring + " then\n"; - tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; - tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; - tempstring = tempstring + " rm -f proccpu\n"; - tempstring = tempstring + " else\n"; - tempstring = tempstring + " AC_MSG_RESULT(no)\n"; - tempstring = tempstring + " lv_PROCCPU=no\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " ;;\n"; - tempstring = tempstring + " (*)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_generic.c 2>&1`\"\n"; - tempstring = tempstring + " then\n"; - tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; - tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; - tempstring = tempstring + " rm -f proccpu\n"; - tempstring = tempstring + " else\n"; - tempstring = tempstring + " AC_MSG_RESULT(no)\n"; - tempstring = tempstring + " lv_PROCCPU=no\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " ;;\n"; - tempstring = tempstring + " esac\n"; - tempstring = tempstring + "])\n" - + tempstring = tempstring +'\ndnl define arch checks\n'; for domarch in dom: if str(domarch.attributes["type"].value) != "all": arch = str(domarch.attributes["name"].value); @@ -73,9 +43,10 @@ def make_set_simd(dom) : tempstring = tempstring + " LF_CHECK_CXX_FLAG([-" + flag + "])\n"; tempstring = tempstring + "])\n"; + tempstring = tempstring +'\ndnl main set_simd_flags\n'; tempstring = tempstring + "AC_DEFUN([LV_SET_SIMD_FLAGS],\n"; tempstring = tempstring + "[\n"; - tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n"; + #tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n"; tempstring = tempstring + " AC_SUBST(LV_CXXFLAGS)\n"; tempstring = tempstring + " indCC=no\n"; tempstring = tempstring + " indCXX=no\n"; @@ -96,29 +67,33 @@ def make_set_simd(dom) : tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; tempstring = tempstring + " ADDONS=\"\"\n"; tempstring = tempstring + " BUILT_ARCHS=\"\"\n"; - tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; + #tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; tempstring = tempstring + " OVERRULE_FLAG=\"no\"\n"; tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; - tempstring = tempstring + " cf_with_lv_arch=$lv_PROCCPU\n"; + tempstring = tempstring + " cf_with_lv_arch=\""; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + arch + " "; + tempstring = tempstring[0:-1] + "\"\n"; tempstring = tempstring + " OVERRULE_FLAG=\"yes\"\n"; - tempstring = tempstring + " fi\n"; + + tempstring = tempstring +'\ndnl init LV_HAVE_XXX and then try to add archs\n'; for domarch in dom: if str(domarch.attributes["type"].value) != "all": arch = str(domarch.attributes["name"].value); tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=no\n"; - tempstring = tempstring + " case \"$MD_CPU\" in\n"; - tempstring = tempstring + " (x86)\n" for domarch in dom: arch = str(domarch.attributes["name"].value); atype = str(domarch.attributes["type"].value); - if atype == "x86": - tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n"; + if atype != "all": + tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n"; for domarch in dom: arch = str(domarch.attributes["name"].value); atype = str(domarch.attributes["type"].value); + tempstring = tempstring +'\ndnl add in flags for arch ' + arch + '\n'; overrule = domarch.getElementsByTagName("overrule"); if overrule: overrule = str(overrule[0].firstChild.data); @@ -131,139 +106,57 @@ def make_set_simd(dom) : overrule_val = ""; flag = domarch.getElementsByTagName("flag"); flag = str(flag[0].firstChild.data); - if atype == "x86": - tempstring = tempstring + " for i in $lf_CXXFLAGS\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; - tempstring = tempstring + " indCXX=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " for i in $lf_CFLAGS\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; - tempstring = tempstring + " indCC=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " indLV_ARCH=no\n" + if atype != "all": + tempstring = tempstring + " for i in $lf_CXXFLAGS\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; + tempstring = tempstring + " indCXX=yes\n"; tempstring = tempstring + " fi\n" - - tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " indCC=no\n" - tempstring = tempstring + " indCXX=no\n" - tempstring = tempstring + " indLV_ARCH=no\n" - elif atype == "all": - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " for i in $lf_CFLAGS\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; + tempstring = tempstring + " indCC=yes\n"; tempstring = tempstring + " fi\n" - tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " done\n" + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " fi\n" - tempstring = tempstring + " ;;\n" - - tempstring = tempstring + " (powerpc)\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - atype = str(domarch.attributes["type"].value); - if atype == "powerpc": - tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n"; - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - atype = str(domarch.attributes["type"].value); - flag = domarch.getElementsByTagName("flag"); - flag = str(flag[0].firstChild.data); - if atype == "powerpc": - tempstring = tempstring + " for i in $lf_CXXFLAGS\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; - tempstring = tempstring + " indCXX=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " for i in $lf_CFLAGS\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; - tempstring = tempstring + " indCC=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test \"$indCC\" = yes && test \"indCXX\" = yes && \"indLV_ARCH\" = yes; then\n" - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " indCC=no\n" - tempstring = tempstring + " indCXX=no\n" - tempstring = tempstring + " indLV_ARCH=no\n" - elif atype == "all": - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " indLV_ARCH=no\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " indLV_ARCH=no\n" - tempstring = tempstring + " ;;\n" - tempstring = tempstring + " (*)\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - atype = str(domarch.attributes["type"].value); - flag = domarch.getElementsByTagName("flag"); - flag = str(flag[0].firstChild.data); - if atype == "all": - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " indLV_ARCH=no\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " indCC=no\n" + tempstring = tempstring + " indCXX=no\n" + tempstring = tempstring + " indLV_ARCH=no\n" + else: + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " indLV_ARCH=no\n" - tempstring = tempstring + " ;;\n" - tempstring = tempstring + " esac\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " indLV_ARCH=no\n" + + tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" tempstring = tempstring + "])\n" diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index bc8f959af..242a8dee7 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -5,12 +5,9 @@ import re import string from xml.dom import minidom from volk_regexp import * -from make_cpuid_x86_c import make_cpuid_x86_c +from make_cpuid_c import make_cpuid_c from make_cpuid_h import make_cpuid_h -from make_proccpu_sim import make_proccpu_sim from make_set_simd import make_set_simd -from make_cpuid_generic_c import make_cpuid_generic_c -from make_cpuid_powerpc_c import make_cpuid_powerpc_c from make_registry import make_registry from make_h import make_h from make_init_h import make_init_h @@ -35,10 +32,7 @@ outfile_typedefs = open("volk_typedefs.h", "w"); outfile_init_h = open("../../lib/volk_init.h", "w"); outfile_init_c = open("../../lib/volk_init.c", "w"); outfile_cpu_h = open("volk_cpu.h", "w"); -outfile_cpu_x86_c = open("../../lib/volk_cpu_x86.c", "w"); -outfile_cpu_generic_c = open("../../lib/volk_cpu_generic.c", "w"); -outfile_cpu_powerpc_c = open("../../lib/volk_cpu_powerpc.c", "w"); -outfile_proccpu_sim = open("../../lib/volk_proccpu_sim.c", "w"); +outfile_cpu_c = open("../../lib/volk_cpu.c", "w"); outfile_config_in = open("../../volk_config.h.in", "w"); outfile_config_fixed = open("volk_config_fixed.h", "w"); outfile_mktables = open("../../lib/volk_mktables.c", "w"); @@ -226,21 +220,12 @@ outfile_mktables.close(); outfile_cpu_h.write(make_cpuid_h(filearchs)); outfile_cpu_h.close(); -outfile_cpu_x86_c.write(make_cpuid_x86_c(filearchs)); -outfile_cpu_x86_c.close(); - -outfile_proccpu_sim.write(make_proccpu_sim(filearchs)); -outfile_proccpu_sim.close(); +outfile_cpu_c.write(make_cpuid_c(filearchs)); +outfile_cpu_c.close(); outfile_set_simd.write(make_set_simd(filearchs)); outfile_set_simd.close(); -outfile_cpu_generic_c.write(make_cpuid_generic_c(filearchs)); -outfile_cpu_generic_c.close(); - -outfile_cpu_powerpc_c.write(make_cpuid_powerpc_c(filearchs)); -outfile_cpu_powerpc_c.close(); - outfile_config_in.write(make_config_in(filearchs)); outfile_config_in.close(); -- cgit From 2387618a492c73ec029599027daeaeebe937e7a7 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 18:58:42 -0800 Subject: comments --- volk/include/volk/make_set_simd.py | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 8aef5bb16..08b0f0937 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -15,16 +15,6 @@ # along with this program. If not, see . # -#TODO: add in checking for 32-bit and 64-bit compiler support, to pick -#which version to use. - -#for a given compiler: first try 64 bit, then fall back to 32 bit. -#to try 64 bit, just try compiling a program with -m64 -#if it passes, whee you're 64-bit -#if not, 32 -#this should probably be elsewhere and just referred in here -#actually, why the fuck does LF_CHECK_CC_FLAG([-m64]) work at all? it doesn't even COMPILE - from xml.dom import minidom def make_set_simd(dom) : -- cgit From d364316fa2e047890af09c7bdeed776d6391b3db Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 21:28:43 -0800 Subject: Removed some mktables stuff since it's passe --- volk/include/volk/make_mktables.py | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 volk/include/volk/make_mktables.py (limited to 'volk/include') diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py deleted file mode 100644 index 25da086fa..000000000 --- a/volk/include/volk/make_mktables.py +++ /dev/null @@ -1,33 +0,0 @@ - - -def make_mktables(funclist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; - - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstrgin = tempstring + '#include\n'; - tempstring = tempstring + "\n\n"; - - tempstring = tempstring + 'int main() {\n'; - tempstring = tempstring + ' int i = 0;\n'; - tempstring = tempstring + ' FILE* output;\n'; - tempstring = tempstring + ' output = fopen("volk_tables.h", "w");\n'; - tempstring = tempstring + ' fprintf(output, "#ifndef INCLUDED_VOLK_TABLES_H\\n");\n'; - tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n'; - - for func in funclist: - tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", 0);\n'; - tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n'; - tempstring = tempstring + ' fclose(output);\n' - tempstring = tempstring + '}\n'; - return tempstring; - - - - - - - - -- cgit From fd03c0ee7cb9af71a5e0292569b626ba3dabd885 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 21:59:19 -0800 Subject: Moved the fn indices gen from volk.c to volk_registry.h so the qa code has access to the static stuff --- volk/include/volk/make_c.py | 18 ------------------ volk/include/volk/make_registry.py | 23 ++++++++++++++++++++++- volk/include/volk/volk_register.py | 2 +- 3 files changed, 23 insertions(+), 20 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index 6e75067d0..f845ce7d4 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -43,24 +43,6 @@ def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : lindex = tempstring.rfind(","); tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); tempstring = tempstring + "};\n\n"; - - tempstring = tempstring + "static const char* " + funclist[i] + "_indices[] = {\n"; - - tags_counter = 0; - for arch_list in fcountlist[i]: - tempstring = tempstring + "#if LV_HAVE_" - for ind in range(len(arch_list)): - - tempstring = tempstring + arch_list[ind]; - if ind < len(arch_list) - 1: - tempstring = tempstring + " && LV_HAVE_"; - - tempstring = tempstring + "\n \"" + str(taglist[i][tags_counter]) + "\",\n#endif\n"; - tags_counter = tags_counter + 1; - - lindex = tempstring.rfind(","); - tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); - tempstring = tempstring + "};\n\n"; tempstring = tempstring + retlist[i] + "inline " + funclist[i] + "_manual" + arched_arglist[i] + '\n'; tempstring = tempstring + "return " + funclist[i] + "_archs[volk_get_index(" + funclist[i] + "_indices, arch, " + funclist[i] + "_arch_defs)](" + my_arglist[i] + ");" + "\n}\n"; diff --git a/volk/include/volk/make_registry.py b/volk/include/volk/make_registry.py index 8457d61f3..2bc8bda28 100644 --- a/volk/include/volk/make_registry.py +++ b/volk/include/volk/make_registry.py @@ -2,7 +2,7 @@ from xml.dom import minidom from emit_omnilog import * import string -def make_registry(dom, funclist, fcountlist) : +def make_registry(dom, funclist, fcountlist, taglist) : tempstring = ""; tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_REGISTRY_H'; @@ -24,6 +24,27 @@ def make_registry(dom, funclist, fcountlist) : tempstring = tempstring +"#endif /*LV_HAVE_" + arch.swapcase() + "*/\n\n"; counter = 0; + + for i in range(len(funclist)): + tempstring = tempstring + "static const char* " + funclist[i] + "_indices[] = {\n"; + + tags_counter = 0; + for arch_list in fcountlist[i]: + tempstring = tempstring + "#if LV_HAVE_" + for ind in range(len(arch_list)): + + tempstring = tempstring + arch_list[ind]; + if ind < len(arch_list) - 1: + tempstring = tempstring + " && LV_HAVE_"; + + tempstring = tempstring + "\n \"" + str(taglist[i][tags_counter]) + "\",\n#endif\n"; + tags_counter = tags_counter + 1; + + lindex = tempstring.rfind(","); + tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); + tempstring = tempstring + "};\n\n"; + + for fcount in fcountlist: tempstring = tempstring + "static const int " + funclist[counter] + "_arch_defs[] = {\n"; counter = counter + 1; diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 242a8dee7..77290f12a 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -229,7 +229,7 @@ outfile_set_simd.close(); outfile_config_in.write(make_config_in(filearchs)); outfile_config_in.close(); -outfile_reg.write(make_registry(filearchs, functions, fcountlist)); +outfile_reg.write(make_registry(filearchs, functions, fcountlist, taglist)); outfile_reg.close(); outfile_h.write(make_h(functions, arched_arglist, retlist)); -- cgit From 88c389e1166cf5427cfabc012502337999c2b68f Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 22:25:54 -0800 Subject: Revert "Removed some mktables stuff since it's passe" This reverts commit d364316fa2e047890af09c7bdeed776d6391b3db. --- volk/include/volk/make_mktables.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 volk/include/volk/make_mktables.py (limited to 'volk/include') diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py new file mode 100644 index 000000000..25da086fa --- /dev/null +++ b/volk/include/volk/make_mktables.py @@ -0,0 +1,33 @@ + + +def make_mktables(funclist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; + + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstrgin = tempstring + '#include\n'; + tempstring = tempstring + "\n\n"; + + tempstring = tempstring + 'int main() {\n'; + tempstring = tempstring + ' int i = 0;\n'; + tempstring = tempstring + ' FILE* output;\n'; + tempstring = tempstring + ' output = fopen("volk_tables.h", "w");\n'; + tempstring = tempstring + ' fprintf(output, "#ifndef INCLUDED_VOLK_TABLES_H\\n");\n'; + tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n'; + + for func in funclist: + tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", 0);\n'; + tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n'; + tempstring = tempstring + ' fclose(output);\n' + tempstring = tempstring + '}\n'; + return tempstring; + + + + + + + + -- cgit From 1bf5a3ea4bbb9fd6baf72eb876778d2652c97f2c Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 22:28:38 -0800 Subject: reverted mktables deletion until i get my act together and make it go straight from python -> .h --- volk/include/volk/volk_register.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 77290f12a..460a11fab 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -18,9 +18,9 @@ from make_runtime_c import make_runtime_c from make_init_c import make_init_c from make_runtime import make_runtime from make_typedefs import make_typedefs -from make_mktables import make_mktables from make_environment_init_c import make_environment_init_c from make_environment_init_h import make_environment_init_h +from make_mktables import make_mktables outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); outfile_reg = open("volk_registry.h", "w"); @@ -213,9 +213,6 @@ for func in functions: fcountlist.append(fcount); taglist.append(tags); -outfile_mktables.write(make_mktables(functions)); -outfile_mktables.close(); - outfile_cpu_h.write(make_cpuid_h(filearchs)); outfile_cpu_h.close(); @@ -261,3 +258,6 @@ outfile_environment_c.close(); outfile_environment_h.write(make_environment_init_h()); outfile_environment_h.close(); + +outfile_mktables.write(make_mktables(functions)); +outfile_mktables.close(); -- cgit From 5d10579e79b35a3d80af967b73dfe72f7ce14740 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 23:37:12 -0800 Subject: volk: temporarily removed avx arch. when -mavx is used, gcc will promote SSE instructions to their AVX equivalents. while this is a benefit on AVX architectures, it makes it really difficult to selectively enable code based on architecture at runtime. GCC suggests compiling different files with different flags and loading them at runtime. GCC can take a flying leap. --- volk/include/volk/archs.xml | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index a19a5add9..70d55a8be 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -126,14 +126,4 @@ msse4.2 - - - 1 - 1 - c - 28 - mavx - - - -- cgit From 0eeeb636a89c5086293bae31511316e4200ad2f9 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 16 Feb 2011 17:20:09 -0800 Subject: Volk_runtime now does self-initialization. You can call volk_xxx_a16() just like in volk.c. The first run of each function does the rank_archs call. Subsequent calls proceed with no overhead. volk_init is still being generated but not used at all. --- volk/include/volk/make_runtime.py | 10 ++-------- volk/include/volk/make_runtime_c.py | 9 +++++---- volk/include/volk/volk_register.py | 4 ++-- 3 files changed, 9 insertions(+), 14 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_runtime.py b/volk/include/volk/make_runtime.py index 645b3aaee..91e703b24 100644 --- a/volk/include/volk/make_runtime.py +++ b/volk/include/volk/make_runtime.py @@ -4,7 +4,7 @@ from volk_regexp import * -def make_runtime(funclist) : +def make_runtime(funclist, arglist) : tempstring = ""; tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; @@ -18,14 +18,8 @@ def make_runtime(funclist) : tempstring = tempstring + '\n'; - tempstring = tempstring + "struct VOLK_RUNTIME {\n"; - for i in range(len(funclist)): - tempstring = tempstring + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + ";\n"; - tempstring = tempstring + "};\n\n"; - - tempstring = tempstring + "struct VOLK_RUNTIME* get_volk_runtime();\n\n" - tempstring = tempstring + "\nvoid volk_runtime_init();\n"; + tempstring = tempstring + "extern void (*" + funclist[i] + ")(" + arglist[i] + ");\n" tempstring = tempstring + emit_epilog(); tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; diff --git a/volk/include/volk/make_runtime_c.py b/volk/include/volk/make_runtime_c.py index 070df9ba7..0519dddf8 100644 --- a/volk/include/volk/make_runtime_c.py +++ b/volk/include/volk/make_runtime_c.py @@ -3,7 +3,7 @@ import string from volk_regexp import * -def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : +def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist, my_argtypelist) : tempstring = ""; tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; @@ -18,8 +18,6 @@ def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcoun for func in funclist: tempstring = tempstring + "#include\n" ; tempstring = tempstring + '\n'; - - tempstring = tempstring + "struct VOLK_RUNTIME volk_runtime;\n"; for i in range(len(funclist)): tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n"; @@ -42,6 +40,9 @@ def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcoun tempstring = tempstring + retlist[i] + "default_acquire_" + funclist[i] + replace_arch.sub("", arched_arglist[i]) + '\n'; - tempstring = tempstring + "volk_runtime." + funclist[i] + " = " + funclist[i] + "_archs[volk_rank_archs(" + funclist[i] + "_arch_defs, volk_get_lvarch())];\n" + "return " + funclist[i] + "_archs[volk_rank_archs(" + funclist[i] + "_arch_defs, volk_get_lvarch())](" + my_arglist[i] + ");" + '\n}\n'; + tempstring = tempstring + " %s = %s_archs[volk_rank_archs(%s_arch_defs, volk_get_lvarch())];\n" % (funclist[i], funclist[i], funclist[i]) + tempstring = tempstring + " %s(%s);\n}\n\n" % (funclist[i], my_arglist[i]) + + tempstring = tempstring + "%s(*%s)(%s) = &default_acquire_%s;\n\n" % (retlist[i], funclist[i], my_argtypelist[i], funclist[i]) return tempstring; diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 460a11fab..b719042df 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -241,13 +241,13 @@ outfile_config_fixed.close(); outfile_c.write( make_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist)); outfile_c.close(); -outfile_runtime_c.write(make_runtime_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist)); +outfile_runtime_c.write(make_runtime_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist, my_argtypelist)); outfile_runtime_c.close(); outfile_init_c.write(make_init_c(functions, filearchs)); outfile_init_c.close(); -outfile_runtime.write(make_runtime(functions)); +outfile_runtime.write(make_runtime(functions, my_argtypelist)); outfile_runtime.close(); outfile_typedefs.write(make_typedefs(functions, retlist, my_argtypelist)); -- cgit From 26c81b79e28f3d4d9a5eb5b5a56338371ff118ce Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 16 Feb 2011 17:30:16 -0800 Subject: Fixed mktables for the old non-runtime volk. --- volk/include/volk/Makefile.am | 3 ++- volk/include/volk/make_mktables.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 7a5edd624..23822b229 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -133,7 +133,8 @@ volkinclude_HEADERS = \ VOLK_MKTABLES_SOURCES = \ $(platform_CODE) \ $(top_srcdir)/lib/volk_rank_archs.c \ - $(top_srcdir)/lib/volk_mktables.c + $(top_srcdir)/lib/volk_mktables.c \ + $(top_srcdir)/lib/volk_cpu.c volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py index 25da086fa..051ac268d 100644 --- a/volk/include/volk/make_mktables.py +++ b/volk/include/volk/make_mktables.py @@ -18,7 +18,7 @@ def make_mktables(funclist) : tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n'; for func in funclist: - tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", 0);\n'; + tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", volk_rank_archs(' + func + '_arch_defs, volk_get_lvarch()));\n'; tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n'; tempstring = tempstring + ' fclose(output);\n' tempstring = tempstring + '}\n'; -- cgit From cef9e33e01e946d82564f517de501cafcb5b1f32 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 9 Mar 2011 17:30:02 -0800 Subject: Volk: first steps to conditional compilation/multiple obj files. --- volk/include/volk/Makefile.am | 6 ------ volk/include/volk/archs.xml | 8 ++++++++ volk/include/volk/make_c.py | 6 +++--- volk/include/volk/make_environment_init_c.py | 5 ++--- volk/include/volk/make_h.py | 1 - volk/include/volk/make_init_c.py | 4 ++-- volk/include/volk/make_registry.py | 15 +++++++-------- volk/include/volk/make_runtime.py | 1 - volk/include/volk/make_runtime_c.py | 7 +++---- volk/include/volk/make_set_simd.py | 18 ++++++++++-------- volk/include/volk/volk_16i_branch_4_state_8_a16.h | 4 ++-- volk/include/volk/volk_16i_convert_8i_a16.h | 2 +- volk/include/volk/volk_16i_convert_8i_u.h | 2 +- volk/include/volk/volk_16i_max_star_16i_a16.h | 4 ++-- .../volk/volk_16i_max_star_horizontal_16i_a16.h | 4 ++-- .../include/volk/volk_16i_permute_and_scalar_add_a16.h | 4 ++-- volk/include/volk/volk_16i_s32f_convert_32f_a16.h | 6 +++--- volk/include/volk/volk_16i_s32f_convert_32f_u.h | 6 +++--- volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h | 4 ++-- volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h | 4 ++-- volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h | 8 ++++---- .../include/volk/volk_16ic_deinterleave_real_16i_a16.h | 6 +++--- volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h | 6 +++--- volk/include/volk/volk_16ic_magnitude_16i_a16.h | 8 ++++---- .../volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h | 6 +++--- .../volk/volk_16ic_s32f_deinterleave_real_32f_a16.h | 6 +++--- volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h | 8 ++++---- volk/include/volk/volk_16u_byteswap_a16.h | 6 +++--- volk/include/volk/volk_32f_accumulator_s32f_a16.h | 4 ++-- volk/include/volk/volk_32f_convert_64f_a16.h | 2 +- volk/include/volk/volk_32f_convert_64f_u.h | 2 +- volk/include/volk/volk_32f_index_max_16u_a16.h | 6 +++--- .../include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h | 4 ++-- .../volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_16i_a16.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_16i_u.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_32i_a16.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_32i_u.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_8i_a16.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_8i_u.h | 4 ++-- volk/include/volk/volk_32f_s32f_normalize_a16.h | 6 +++--- volk/include/volk/volk_32f_s32f_power_32f_a16.h | 14 +++++++------- volk/include/volk/volk_32f_s32f_stddev_32f_a16.h | 6 +++--- volk/include/volk/volk_32f_sqrt_32f_a16.h | 6 +++--- .../include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h | 6 +++--- volk/include/volk/volk_32f_x2_add_32f_a16.h | 6 +++--- volk/include/volk/volk_32f_x2_divide_32f_a16.h | 6 +++--- volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h | 8 ++++---- volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 8 ++++---- volk/include/volk/volk_32f_x2_interleave_32fc_a16.h | 4 ++-- volk/include/volk/volk_32f_x2_max_32f_a16.h | 6 +++--- volk/include/volk/volk_32f_x2_min_32f_a16.h | 6 +++--- volk/include/volk/volk_32f_x2_multiply_32f_a16.h | 6 +++--- .../volk/volk_32f_x2_s32f_interleave_16ic_a16.h | 6 +++--- volk/include/volk/volk_32f_x2_subtract_32f_a16.h | 6 +++--- volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h | 4 ++-- volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h | 6 +++--- volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h | 4 ++-- volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h | 4 ++-- .../include/volk/volk_32fc_deinterleave_real_32f_a16.h | 4 ++-- .../include/volk/volk_32fc_deinterleave_real_64f_a16.h | 4 ++-- volk/include/volk/volk_32fc_index_max_16u_a16.h | 4 ++-- volk/include/volk/volk_32fc_magnitude_32f_a16.h | 8 ++++---- volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h | 14 +++++++------- .../volk/volk_32fc_s32f_deinterleave_real_16i_a16.h | 4 ++-- volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h | 8 ++++---- volk/include/volk/volk_32fc_s32f_power_32fc_a16.h | 8 ++++---- .../volk/volk_32fc_s32f_power_spectrum_32f_a16.h | 8 ++++---- .../volk_32fc_s32f_x2_power_spectral_density_32f_a16.h | 8 ++++---- .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h | 6 +++--- .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 4 ++-- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 10 +++++----- volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h | 6 +++--- ...volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h | 4 ++-- volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h | 4 ++-- volk/include/volk/volk_32i_s32f_convert_32f_a16.h | 4 ++-- volk/include/volk/volk_32i_s32f_convert_32f_u.h | 4 ++-- volk/include/volk/volk_32i_x2_and_32i_a16.h | 6 +++--- volk/include/volk/volk_32i_x2_or_32i_a16.h | 6 +++--- volk/include/volk/volk_32u_byteswap_a16.h | 4 ++-- volk/include/volk/volk_32u_popcnt_a16.h | 4 ++-- volk/include/volk/volk_64f_convert_32f_a16.h | 2 +- volk/include/volk/volk_64f_convert_32f_u.h | 2 +- volk/include/volk/volk_64f_x2_max_64f_a16.h | 4 ++-- volk/include/volk/volk_64f_x2_min_64f_a16.h | 4 ++-- volk/include/volk/volk_64u_byteswap_a16.h | 4 ++-- volk/include/volk/volk_64u_popcnt_a16.h | 4 ++-- volk/include/volk/volk_8i_convert_16i_a16.h | 6 +++--- volk/include/volk/volk_8i_convert_16i_u.h | 4 ++-- volk/include/volk/volk_8i_s32f_convert_32f_a16.h | 6 +++--- volk/include/volk/volk_8i_s32f_convert_32f_u.h | 4 ++-- volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h | 4 ++-- volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h | 4 ++-- volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h | 4 ++-- .../volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h | 6 +++--- .../volk/volk_8ic_s32f_deinterleave_real_32f_a16.h | 6 +++--- .../volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h | 4 ++-- .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h | 4 ++-- volk/include/volk/volk_common.h | 2 +- volk/include/volk/volk_register.py | 11 ++++++++--- 100 files changed, 275 insertions(+), 271 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 23822b229..caf707138 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -26,7 +26,6 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ volkincludedir = $(prefix)/include/volk BUILT_SOURCES: \ - volk_config.h \ volk_tables.h volkinclude_HEADERS = \ @@ -34,7 +33,6 @@ volkinclude_HEADERS = \ volk_common.h \ volk_config_fixed.h \ volk_runtime.h \ - volk_config.h \ volk_tables.h \ volk_typedefs.h \ volk_registry.h \ @@ -143,12 +141,8 @@ volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) volk_tables.h: volk_mktables$(EXEEXT) ./volk_mktables$(EXEEXT) -volk_config.h: $(top_builddir)/volk_config.h - cp $^ $(top_builddir)/include/volk/$@ - distclean-local: rm -f volk_config_fixed.h - rm -f volk_config.h rm -f volk_cpu.h rm -f volk.h rm -f volk_registry.h diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index 70d55a8be..2547df590 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -126,4 +126,12 @@ msse4.2 + + 1 + 1 + c + 28 + mavx + + diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index f845ce7d4..89bf9ea1a 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -30,12 +30,12 @@ def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : tags_counter = 0; for arch_list in fcountlist[i]: - tempstring = tempstring + "#if LV_HAVE_" + tempstring = tempstring + "#if defined(LV_HAVE_" for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind]; + tempstring = tempstring + arch_list[ind] + ")"; if ind < len(arch_list) - 1: - tempstring = tempstring + " && LV_HAVE_"; + tempstring = tempstring + " && defined(LV_HAVE_"; tempstring = tempstring + "\n " + funclist[i] + "_" + str(taglist[i][tags_counter]) + ",\n#endif\n"; tags_counter = tags_counter + 1; diff --git a/volk/include/volk/make_environment_init_c.py b/volk/include/volk/make_environment_init_c.py index e06c7f246..263d5bcd1 100644 --- a/volk/include/volk/make_environment_init_c.py +++ b/volk/include/volk/make_environment_init_c.py @@ -4,13 +4,12 @@ def make_environment_init_c(dom) : tempstring = ""; tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring + "#include\n" - tempstring = tempstring + "#include\n" for domarch in dom: arch = str(domarch.attributes["name"].value); incs = domarch.getElementsByTagName("include"); for inc in incs: my_inc = str(inc.firstChild.data); - tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; tempstring = tempstring + "#include<" + my_inc + ">\n"; tempstring = tempstring + "#endif\n" tempstring = tempstring + '\n\n'; @@ -21,7 +20,7 @@ def make_environment_init_c(dom) : envs = domarch.getElementsByTagName("environment"); for env in envs: cmd = str(env.firstChild.data); - tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; tempstring = tempstring + " " + cmd + "\n"; tempstring = tempstring + "#endif\n" diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py index 81d9ad401..6aea441b7 100644 --- a/volk/include/volk/make_h.py +++ b/volk/include/volk/make_h.py @@ -11,7 +11,6 @@ def make_h(funclist, arched_arglist, retlist) : tempstring = tempstring + '\n#define INCLUDED_VOLK_H'; tempstring = tempstring + '\n\n#include\n'; tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n' tempstring = tempstring + emit_prolog() diff --git a/volk/include/volk/make_init_c.py b/volk/include/volk/make_init_c.py index 330e19592..0a7010cd6 100644 --- a/volk/include/volk/make_init_c.py +++ b/volk/include/volk/make_init_c.py @@ -12,7 +12,7 @@ def make_init_c(funclist, dom) : incs = domarch.getElementsByTagName("include"); for inc in incs: my_inc = str(inc.firstChild.data); - tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; tempstring = tempstring + "#include<" + my_inc + ">\n"; tempstring = tempstring + "#endif\n" tempstring = tempstring + '\n\n'; @@ -32,7 +32,7 @@ def make_init_c(funclist, dom) : for env in envs: cmd = str(env.firstChild.data); tempstring = tempstring + " if(volk_cpu.has_" + arch + "()){\n"; - tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; tempstring = tempstring + " " + cmd + "\n"; tempstring = tempstring + "#endif\n" tempstring = tempstring + " }\n"; diff --git a/volk/include/volk/make_registry.py b/volk/include/volk/make_registry.py index 2bc8bda28..7fbe9a8b0 100644 --- a/volk/include/volk/make_registry.py +++ b/volk/include/volk/make_registry.py @@ -7,7 +7,6 @@ def make_registry(dom, funclist, fcountlist, taglist) : tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_REGISTRY_H'; tempstring = tempstring +'\n#define INCLUDED_VOLK_REGISTRY_H\n\n'; - tempstring = tempstring +'#include\n'; tempstring = tempstring +'#include\n'; tempstring = tempstring + emit_prolog(); tempstring = tempstring + '\n' @@ -17,7 +16,7 @@ def make_registry(dom, funclist, fcountlist, taglist) : for domarch in dom: arch = str(domarch.attributes["name"].value); - tempstring = tempstring +"#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring +"#ifdef LV_HAVE_" + arch.swapcase() + "\n"; tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 1\n"; tempstring = tempstring +"#else\n"; tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 0\n"; @@ -30,12 +29,12 @@ def make_registry(dom, funclist, fcountlist, taglist) : tags_counter = 0; for arch_list in fcountlist[i]: - tempstring = tempstring + "#if LV_HAVE_" + tempstring = tempstring + "#if defined(LV_HAVE_" for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind]; + tempstring = tempstring + arch_list[ind] + ")"; if ind < len(arch_list) - 1: - tempstring = tempstring + " && LV_HAVE_"; + tempstring = tempstring + " && defined(LV_HAVE_"; tempstring = tempstring + "\n \"" + str(taglist[i][tags_counter]) + "\",\n#endif\n"; tags_counter = tags_counter + 1; @@ -59,11 +58,11 @@ def make_registry(dom, funclist, fcountlist, taglist) : tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], " + ", ""); tempstring = tempstring + ",\n" for arch_list in fcount: - tempstring = tempstring + "#if LV_HAVE_" + tempstring = tempstring + "#if defined(LV_HAVE_" for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind]; + tempstring = tempstring + arch_list[ind] + ")"; if ind < len(arch_list) - 1: - tempstring = tempstring + " && LV_HAVE_"; + tempstring = tempstring + " && defined(LV_HAVE_"; tempstring = tempstring + "\n" tempstring = tempstring + " (1 << LV_" for ind in range(len(arch_list)): diff --git a/volk/include/volk/make_runtime.py b/volk/include/volk/make_runtime.py index 91e703b24..d468487d7 100644 --- a/volk/include/volk/make_runtime.py +++ b/volk/include/volk/make_runtime.py @@ -11,7 +11,6 @@ def make_runtime(funclist, arglist) : tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_RUNTIME'; tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME'; tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; tempstring = tempstring + emit_prolog(); diff --git a/volk/include/volk/make_runtime_c.py b/volk/include/volk/make_runtime_c.py index 0519dddf8..99cdf395f 100644 --- a/volk/include/volk/make_runtime_c.py +++ b/volk/include/volk/make_runtime_c.py @@ -9,7 +9,6 @@ def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcoun tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; tempstring = tempstring + "#include\n"; tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; @@ -24,12 +23,12 @@ def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcoun tags_counter = 0; for arch_list in fcountlist[i]: - tempstring = tempstring + "#if LV_HAVE_" + tempstring = tempstring + "#if defined(LV_HAVE_" for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind]; + tempstring = tempstring + arch_list[ind] + ")"; if ind < len(arch_list) - 1: - tempstring = tempstring + " && LV_HAVE_"; + tempstring = tempstring + " && defined(LV_HAVE_"; tempstring = tempstring + "\n " + funclist[i] + "_" + str(taglist[i][tags_counter]) + ",\n#endif\n"; tags_counter = tags_counter + 1; diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 08b0f0937..da631d217 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -54,7 +54,7 @@ def make_set_simd(dom) : tempstring = tempstring + " [cf_with_lv_arch=\"\"])\n"; if str(domarch.attributes["type"].value) == "all": arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; + tempstring = tempstring + " AC_DEFINE(LV_MAKE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; tempstring = tempstring + " ADDONS=\"\"\n"; tempstring = tempstring + " BUILT_ARCHS=\"\"\n"; #tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; @@ -68,11 +68,11 @@ def make_set_simd(dom) : tempstring = tempstring + " OVERRULE_FLAG=\"yes\"\n"; tempstring = tempstring + " fi\n"; - tempstring = tempstring +'\ndnl init LV_HAVE_XXX and then try to add archs\n'; + tempstring = tempstring +'\ndnl init LV_MAKE_XXX and then try to add archs\n'; for domarch in dom: if str(domarch.attributes["type"].value) != "all": arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=no\n"; + tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=no\n"; for domarch in dom: arch = str(domarch.attributes["name"].value); @@ -121,10 +121,9 @@ def make_set_simd(dom) : tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; + #tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indCC=no\n" tempstring = tempstring + " indCXX=no\n" @@ -140,13 +139,16 @@ def make_set_simd(dom) : tempstring = tempstring + " indLV_ARCH=no\n" tempstring = tempstring + " fi\n" tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indLV_ARCH=no\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " AM_CONDITIONAL(LV_MAKE_" + arch.swapcase() + ", test \"$LV_MAKE_" + arch.swapcase() + "\" == \"yes\")\n"; + tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" tempstring = tempstring + "])\n" diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a16.h b/volk/include/volk/volk_16i_branch_4_state_8_a16.h index 3437c1a6b..5eb03b346 100644 --- a/volk/include/volk/volk_16i_branch_4_state_8_a16.h +++ b/volk/include/volk/volk_16i_branch_4_state_8_a16.h @@ -8,7 +8,7 @@ -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include #include @@ -137,7 +137,7 @@ static inline void volk_16i_branch_4_state_8_a16_ssse3(short* target, short* s #endif /*LV_HAVE_SSEs*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_16i_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { int i = 0; diff --git a/volk/include/volk/volk_16i_convert_8i_a16.h b/volk/include/volk/volk_16i_convert_8i_a16.h index 73e45ad63..4d51e5903 100644 --- a/volk/include/volk/volk_16i_convert_8i_a16.h +++ b/volk/include/volk/volk_16i_convert_8i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Converts the input 16 bit integer data into 8 bit integer data diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/include/volk/volk_16i_convert_8i_u.h index 5fc792b56..df1084fe0 100644 --- a/volk/include/volk/volk_16i_convert_8i_u.h +++ b/volk/include/volk/volk_16i_convert_8i_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Converts the input 16 bit integer data into 8 bit integer data diff --git a/volk/include/volk/volk_16i_max_star_16i_a16.h b/volk/include/volk/volk_16i_max_star_16i_a16.h index ff57bd2a1..063444279 100644 --- a/volk/include/volk/volk_16i_max_star_16i_a16.h +++ b/volk/include/volk/volk_16i_max_star_16i_a16.h @@ -6,7 +6,7 @@ #include -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include #include @@ -85,7 +85,7 @@ static inline void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, #endif /*LV_HAVE_SSSE3*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, unsigned int num_bytes) { diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h index 695e08dbf..ece6adb40 100644 --- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h +++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h @@ -6,7 +6,7 @@ #include -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include #include @@ -109,7 +109,7 @@ static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, #endif /*LV_HAVE_SSSE3*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { int i = 0; diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h index e52a949fb..ae1a18157 100644 --- a/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h +++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h @@ -8,7 +8,7 @@ -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include #include @@ -116,7 +116,7 @@ static inline void volk_16i_permute_and_scalar_add_a16_sse2(short* target, sho #endif /*LV_HAVE_SSEs*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { int i = 0; diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a16.h b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h index 83fd26ff9..09bc252f0 100644 --- a/volk/include/volk/volk_16i_s32f_convert_32f_a16.h +++ b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! @@ -58,7 +58,7 @@ static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, con } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! @@ -94,7 +94,7 @@ static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 16 bit input data buffer diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/include/volk/volk_16i_s32f_convert_32f_u.h index 8f0dd0083..d34acc091 100644 --- a/volk/include/volk/volk_16i_s32f_convert_32f_u.h +++ b/volk/include/volk/volk_16i_s32f_convert_32f_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! @@ -59,7 +59,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! @@ -96,7 +96,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const in } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 16 bit input data buffer diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h index e4ec5ab4e..94e5eb986 100644 --- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h +++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h @@ -9,7 +9,7 @@ -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include @@ -167,7 +167,7 @@ static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* #endif /*LV_HAVE_SSE2*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { int i = 0; diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h index 5744ca3a6..c157bf64a 100644 --- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h +++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h @@ -9,7 +9,7 @@ -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include #include @@ -111,7 +111,7 @@ static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* #endif /*LV_HAVE_SSE2*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h index 7e08bf182..227a92303 100644 --- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h +++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include /*! \brief Deinterleaves the complex 16 bit vector into I & Q vector data @@ -52,7 +52,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int } #endif /* LV_HAVE_SSSE3 */ -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Deinterleaves the complex 16 bit vector into I & Q vector data @@ -120,7 +120,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int1 } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 16 bit vector into I & Q vector data \param complexVector The complex input vector @@ -140,7 +140,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, i } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Deinterleaves the complex 16 bit vector into I & Q vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h index 388c00592..35d0e8be2 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include /*! \brief Deinterleaves the complex 16 bit vector into I vector data @@ -47,7 +47,7 @@ static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, c #endif /* LV_HAVE_SSSE3 */ -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Deinterleaves the complex 16 bit vector into I vector data @@ -96,7 +96,7 @@ static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, co } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 16 bit vector into I vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h index 55a25702e..bdf5fc162 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include /*! \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data @@ -59,7 +59,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con } #endif /* LV_HAVE_SSSE3 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data \param complexVector The complex input vector @@ -77,7 +77,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, c } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a16.h b/volk/include/volk/volk_16ic_magnitude_16i_a16.h index bdcace750..e75d54ec4 100644 --- a/volk/include/volk/volk_16ic_magnitude_16i_a16.h +++ b/volk/include/volk/volk_16ic_magnitude_16i_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector @@ -76,7 +76,7 @@ static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, co } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector @@ -153,7 +153,7 @@ static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, con } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values @@ -173,7 +173,7 @@ static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC_DISABLED +#ifdef LV_HAVE_ORC_DISABLED /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h index 606de2fc5..dcb2499fa 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data @@ -68,7 +68,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, fl } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data \param complexVector The complex input vector @@ -89,7 +89,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h index 62331e496..f21fe77f8 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Deinterleaves the complex 16 bit vector into I float vector data @@ -52,7 +52,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffe } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Deinterleaves the complex 16 bit vector into I float vector data @@ -99,7 +99,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 16 bit vector into I float vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h index ae64efbeb..388d2ebcd 100644 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector @@ -70,7 +70,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector @@ -140,7 +140,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values @@ -161,7 +161,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVect } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC_DISABLED +#ifdef LV_HAVE_ORC_DISABLED /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values diff --git a/volk/include/volk/volk_16u_byteswap_a16.h b/volk/include/volk/volk_16u_byteswap_a16.h index c8128dbab..f393c05c5 100644 --- a/volk/include/volk/volk_16u_byteswap_a16.h +++ b/volk/include/volk/volk_16u_byteswap_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! @@ -43,7 +43,7 @@ static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Byteswaps (in-place) an aligned vector of int16_t's. \param intsToSwap The vector of data to byte swap @@ -61,7 +61,7 @@ static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Byteswaps (in-place) an aligned vector of int16_t's. \param intsToSwap The vector of data to byte swap diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a16.h b/volk/include/volk/volk_32f_accumulator_s32f_a16.h index 4a3588e6d..6a85e066e 100644 --- a/volk/include/volk/volk_32f_accumulator_s32f_a16.h +++ b/volk/include/volk/volk_32f_accumulator_s32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Accumulates the values in the input buffer @@ -42,7 +42,7 @@ static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Accumulates the values in the input buffer \param result The accumulated result diff --git a/volk/include/volk/volk_32f_convert_64f_a16.h b/volk/include/volk/volk_32f_convert_64f_a16.h index c303dc118..8ca83220b 100644 --- a/volk/include/volk/volk_32f_convert_64f_a16.h +++ b/volk/include/volk/volk_32f_convert_64f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Converts the float values into double values diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h index a825767de..387baa3b9 100644 --- a/volk/include/volk/volk_32f_convert_64f_u.h +++ b/volk/include/volk/volk_32f_convert_64f_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Converts the float values into double values diff --git a/volk/include/volk/volk_32f_index_max_16u_a16.h b/volk/include/volk/volk_32f_index_max_16u_a16.h index d070e17d5..3934d2db7 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a16.h +++ b/volk/include/volk/volk_32f_index_max_16u_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { @@ -63,7 +63,7 @@ static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const #endif /*LV_HAVE_SSE4_1*/ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const float* src0, unsigned int num_points) { @@ -122,7 +122,7 @@ static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const fl #endif /*LV_HAVE_SSE*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, const float* src0, unsigned int num_points) { if(num_points > 0){ float max = src0[0]; diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h index ff4d5b19c..6efd21a37 100644 --- a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. @@ -78,7 +78,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. \param outputVector The byte-aligned vector where the results will be stored. diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h index 168245d65..55d4e0319 100644 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Calculates the spectral noise floor of an input power spectrum @@ -116,7 +116,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* no } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the spectral noise floor of an input power spectrum diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h index d6b16e336..9d1d0ef4d 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value @@ -44,7 +44,7 @@ static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, con } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h index 4d306e53c..06228ef7d 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value @@ -45,7 +45,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h index ae874fd7b..82c74bf44 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value @@ -40,7 +40,7 @@ static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, con } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h index 561fcd800..253a48ae3 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value @@ -41,7 +41,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h index f64f2a213..8dab0cdf4 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value @@ -51,7 +51,7 @@ static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h index 420693571..72b193c9d 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value @@ -52,7 +52,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value diff --git a/volk/include/volk/volk_32f_s32f_normalize_a16.h b/volk/include/volk/volk_32f_s32f_normalize_a16.h index 0850cddf7..e6195cd32 100644 --- a/volk/include/volk/volk_32f_s32f_normalize_a16.h +++ b/volk/include/volk/volk_32f_s32f_normalize_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) @@ -41,7 +41,7 @@ static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Normalizes the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -60,7 +60,7 @@ static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const f } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Normalizes the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a16.h b/volk/include/volk/volk_32f_s32f_power_32f_a16.h index 3ed594d9a..ecff901e1 100644 --- a/volk/include/volk/volk_32f_s32f_power_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_power_32f_a16.h @@ -5,10 +5,10 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -26,7 +26,7 @@ static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const floa float* cPtr = cVector; const float* aPtr = aVector; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 vPower = _mm_set_ps1(power); __m128 zeroValue = _mm_setzero_ps(); __m128 signMask; @@ -62,10 +62,10 @@ static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const floa } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -83,7 +83,7 @@ static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* float* cPtr = cVector; const float* aPtr = aVector; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 vPower = _mm_set_ps1(power); __m128 zeroValue = _mm_setzero_ps(); __m128 signMask; @@ -119,7 +119,7 @@ static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Takes each the input vector value to the specified power and stores the results in the return vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h index 32f4fa067..48d2fe1fe 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Calculates the standard deviation of the input buffer using the supplied mean @@ -65,7 +65,7 @@ static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const floa } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Calculates the standard deviation of the input buffer using the supplied mean @@ -111,7 +111,7 @@ static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the standard deviation of the input buffer using the supplied mean \param stddev The calculated standard deviation diff --git a/volk/include/volk/volk_32f_sqrt_32f_a16.h b/volk/include/volk/volk_32f_sqrt_32f_a16.h index 513c2cffe..a9ce76f88 100644 --- a/volk/include/volk/volk_32f_sqrt_32f_a16.h +++ b/volk/include/volk/volk_32f_sqrt_32f_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Sqrts the two input vectors and store their results in the third vector @@ -40,7 +40,7 @@ static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVecto } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Sqrts the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -58,7 +58,7 @@ static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aV } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC extern void volk_32f_sqrt_32f_a16_orc_impl(float *, const float*, unsigned int); /*! \brief Sqrts the two input vectors and store their results in the third vector diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h index 278089841..f1cb2ae0e 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Calculates the standard deviation and mean of the input buffer @@ -78,7 +78,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, flo } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Calculates the standard deviation and mean of the input buffer @@ -134,7 +134,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the standard deviation and mean of the input buffer \param stddev The calculated standard deviation diff --git a/volk/include/volk/volk_32f_x2_add_32f_a16.h b/volk/include/volk/volk_32f_x2_add_32f_a16.h index d0d0e0a0e..2de6a6644 100644 --- a/volk/include/volk/volk_32f_x2_add_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_add_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Adds the two input vectors and store their results in the third vector @@ -43,7 +43,7 @@ static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVec } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Adds the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -63,7 +63,7 @@ static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Adds the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a16.h b/volk/include/volk/volk_32f_x2_divide_32f_a16.h index d844e25b0..1603e78de 100644 --- a/volk/include/volk/volk_32f_x2_divide_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_divide_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Divides the two input vectors and store their results in the third vector @@ -43,7 +43,7 @@ static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* a } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Divides the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -63,7 +63,7 @@ static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const floa } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Divides the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h index 61aa56815..d13f12e51 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h @@ -4,7 +4,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { @@ -24,7 +24,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const fl #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { @@ -73,7 +73,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float #endif /*LV_HAVE_SSE*/ -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include @@ -120,7 +120,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index 8469a3cea..7c1136a67 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -4,7 +4,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) { @@ -24,7 +24,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { @@ -73,7 +73,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* #endif /*LV_HAVE_SSE*/ -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include @@ -120,7 +120,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h index 29c9392df..f3731fa2a 100644 --- a/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h +++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Interleaves the I & Q vector data into the complex vector @@ -48,7 +48,7 @@ static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Interleaves the I & Q vector data into the complex vector. \param iBuffer The I buffer data to be interleaved diff --git a/volk/include/volk/volk_32f_x2_max_32f_a16.h b/volk/include/volk/volk_32f_x2_max_32f_a16.h index 26e7f1246..60be6e36d 100644 --- a/volk/include/volk/volk_32f_x2_max_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_max_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector @@ -45,7 +45,7 @@ static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVec } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector \param cVector The vector where the results will be stored @@ -67,7 +67,7 @@ static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_x2_min_32f_a16.h b/volk/include/volk/volk_32f_x2_min_32f_a16.h index 23bae044c..3b8291531 100644 --- a/volk/include/volk/volk_32f_x2_min_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_min_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector @@ -45,7 +45,7 @@ static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVec } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector \param cVector The vector where the results will be stored @@ -67,7 +67,7 @@ static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h index a0dcfa86e..cef17f5a6 100644 --- a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplys the two input vectors and store their results in the third vector @@ -43,7 +43,7 @@ static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Multiplys the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -63,7 +63,7 @@ static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const fl } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Multiplys the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h index 30306774d..e98735245 100644 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. @@ -62,7 +62,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexV } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. @@ -127,7 +127,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVe } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. \param iBuffer The I buffer data to be interleaved diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a16.h b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h index 7404bfe79..c01f2c1f3 100644 --- a/volk/include/volk/volk_32f_x2_subtract_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Subtracts bVector form aVector and store their results in the cVector @@ -43,7 +43,7 @@ static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Subtracts bVector form aVector and store their results in the cVector \param cVector The vector where the results will be stored @@ -63,7 +63,7 @@ static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const fl } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Subtracts bVector form aVector and store their results in the cVector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h index af9e39537..6e446cbef 100644 --- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h +++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h @@ -9,7 +9,7 @@ #define MAX(X,Y) ((X) > (Y)?(X):(Y)) #endif -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include #include @@ -98,7 +98,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* sr #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h index 514998800..846315a4a 100644 --- a/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h +++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies the input complex vector with the input float vector and store their results in the third vector @@ -56,7 +56,7 @@ static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector \param cVector The vector where the results will be stored @@ -76,7 +76,7 @@ static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, c } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h index 84d2576ed..3e7c3fa28 100644 --- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Deinterleaves the complex vector into I & Q vector data @@ -49,7 +49,7 @@ static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex vector into I & Q vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h index 34262a7af..945a26742 100644 --- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h +++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data @@ -51,7 +51,7 @@ static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, doubl } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h index 9838ec88b..3c3fb2583 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Deinterleaves the complex vector into I vector data @@ -44,7 +44,7 @@ static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex vector into I vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h index af392d074..40c1a7a46 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h +++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Deinterleaves the complex vector into I vector data @@ -42,7 +42,7 @@ static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, con } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex vector into I vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_32fc_index_max_16u_a16.h b/volk/include/volk/volk_32fc_index_max_16u_a16.h index 532ae4e7c..0ad1edbe9 100644 --- a/volk/include/volk/volk_32fc_index_max_16u_a16.h +++ b/volk/include/volk/volk_32fc_index_max_16u_a16.h @@ -6,7 +6,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include #include @@ -188,7 +188,7 @@ static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32f #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { float sq_dist = 0.0; float max = 0.0; diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a16.h b/volk/include/volk/volk_32fc_magnitude_32f_a16.h index be7216dce..946190e41 100644 --- a/volk/include/volk/volk_32fc_magnitude_32f_a16.h +++ b/volk/include/volk/volk_32fc_magnitude_32f_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector @@ -48,7 +48,7 @@ static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, cons } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector @@ -96,7 +96,7 @@ static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values @@ -115,7 +115,7 @@ static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, c } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h index e9f74438d..55b1b6c70 100644 --- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h +++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h @@ -5,10 +5,10 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -27,7 +27,7 @@ static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, con const unsigned int quarterPoints = num_points / 4; const float invNormalizeFactor = 1.0 / normalizeFactor; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 testVector = _mm_set_ps1(2*M_PI); __m128 correctVector = _mm_set_ps1(M_PI); __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); @@ -67,10 +67,10 @@ static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, con #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -89,7 +89,7 @@ static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const const unsigned int quarterPoints = num_points / 4; const float invNormalizeFactor = 1.0 / normalizeFactor; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 testVector = _mm_set_ps1(2*M_PI); __m128 correctVector = _mm_set_ps1(M_PI); __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); @@ -131,7 +131,7 @@ static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief performs the atan2 on the input vector and stores the results in the output vector. \param outputVector The vector where the results will be stored. diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h index 31465bff9..1e3e61e08 100644 --- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h +++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data @@ -54,7 +54,7 @@ static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h index 530359600..14318ab01 100644 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include /*! \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector @@ -60,7 +60,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVecto } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector @@ -120,7 +120,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values @@ -140,7 +140,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVe } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h index 3507fdb3c..90cc4f5e7 100644 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h @@ -4,10 +4,10 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -25,7 +25,7 @@ static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const l lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 vPower = _mm_set_ps1(power); __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; @@ -81,7 +81,7 @@ static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const l } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Takes each the input complex vector value to the specified power and stores the results in the return vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h index 39d8f7aa2..03da069c2 100644 --- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h +++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h @@ -5,10 +5,10 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -24,7 +24,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOut float* destPtr = logPowerOutput; uint64_t number = 0; const float iNormalizationFactor = 1.0 / normalizationFactor; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 magScalar = _mm_set_ps1(10.0); magScalar = _mm_div_ps(magScalar, logf4(magScalar)); @@ -88,7 +88,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOut } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the log10 power value for each input point \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h index 0120b5307..5bcd7f7c4 100644 --- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h +++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h @@ -5,10 +5,10 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -27,7 +27,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* const float iRBW = 1.0 / rbw; const float iNormalizationFactor = 1.0 / normalizationFactor; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 magScalar = _mm_set_ps1(10.0); magScalar = _mm_div_ps(magScalar, logf4(magScalar)); @@ -94,7 +94,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the log10 power value divided by the RBW for each input point \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h index a01971df3..2e5036f69 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h @@ -5,7 +5,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -59,7 +59,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* r #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE && LV_HAVE_64 +#if defined(LV_HAVE_SSE) && defined(LV_HAVE_64) static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -202,7 +202,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* resul } #endif -#if LV_HAVE_SSE && LV_HAVE_32 +#if defined(LV_HAVE_SSE) && defined(LV_HAVE_32) static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h index 2fa5918cc..69781f0fb 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -5,7 +5,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -57,7 +57,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* res #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include #include diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h index 9a7b65ab4..9bfb88934 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h @@ -6,7 +6,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -56,7 +56,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, con #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE && LV_HAVE_64 +#if defined(LV_HAVE_SSE) && defined(LV_HAVE_64) static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -192,7 +192,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, cons #endif -#if LV_HAVE_SSE && LV_HAVE_32 +#if defined(LV_HAVE_SSE) && defined(LV_HAVE_32) static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -316,7 +316,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, cons #endif /*LV_HAVE_SSE*/ -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include @@ -373,7 +373,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h index b4214f5d2..72010b855 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h @@ -6,7 +6,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include /*! \brief Multiplies the two input complex vectors and stores their results in the third vector @@ -53,7 +53,7 @@ static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Multiplies the two input complex vectors and stores their results in the third vector \param cVector The vector where the results will be stored @@ -73,7 +73,7 @@ static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, co } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Multiplies the two input complex vectors and stores their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h index 6a863b16d..910f51679 100644 --- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h +++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h @@ -6,7 +6,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include #include @@ -105,7 +105,7 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h index 406097fc8..551f3cb53 100644 --- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h +++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include #include @@ -91,7 +91,7 @@ static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_ #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a16.h b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h index 0fcadd9cb..b744c7197 100644 --- a/volk/include/volk/volk_32i_s32f_convert_32f_a16.h +++ b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! @@ -47,7 +47,7 @@ static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 32 bit input data buffer diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h index 1dd6422f8..d8afd218c 100644 --- a/volk/include/volk/volk_32i_s32f_convert_32f_u.h +++ b/volk/include/volk/volk_32i_s32f_convert_32f_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! @@ -48,7 +48,7 @@ static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const i #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 32 bit input data buffer diff --git a/volk/include/volk/volk_32i_x2_and_32i_a16.h b/volk/include/volk/volk_32i_x2_and_32i_a16.h index 3baa1d856..4d50efd32 100644 --- a/volk/include/volk/volk_32i_x2_and_32i_a16.h +++ b/volk/include/volk/volk_32i_x2_and_32i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Ands the two input vectors and store their results in the third vector @@ -43,7 +43,7 @@ static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Ands the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -63,7 +63,7 @@ static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32 } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Ands the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32i_x2_or_32i_a16.h b/volk/include/volk/volk_32i_x2_or_32i_a16.h index 0be22f00a..9edbdbafd 100644 --- a/volk/include/volk/volk_32i_x2_or_32i_a16.h +++ b/volk/include/volk/volk_32i_x2_or_32i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Ors the two input vectors and store their results in the third vector @@ -43,7 +43,7 @@ static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* a } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Ors the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -63,7 +63,7 @@ static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_ } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Ors the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32u_byteswap_a16.h b/volk/include/volk/volk_32u_byteswap_a16.h index 7556ec7b1..dc5cedab9 100644 --- a/volk/include/volk/volk_32u_byteswap_a16.h +++ b/volk/include/volk/volk_32u_byteswap_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! @@ -51,7 +51,7 @@ static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Byteswaps (in-place) an aligned vector of int32_t's. \param intsToSwap The vector of data to byte swap diff --git a/volk/include/volk/volk_32u_popcnt_a16.h b/volk/include/volk/volk_32u_popcnt_a16.h index f6e25e4e8..0d8b48fd5 100644 --- a/volk/include/volk/volk_32u_popcnt_a16.h +++ b/volk/include/volk/volk_32u_popcnt_a16.h @@ -5,7 +5,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t value) { @@ -23,7 +23,7 @@ static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t val #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE4_2 +#ifdef LV_HAVE_SSE4_2 #include diff --git a/volk/include/volk/volk_64f_convert_32f_a16.h b/volk/include/volk/volk_64f_convert_32f_a16.h index 7dca065f0..cfcdbdc3a 100644 --- a/volk/include/volk/volk_64f_convert_32f_a16.h +++ b/volk/include/volk/volk_64f_convert_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Converts the double values into float values diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h index 6338c1433..5c323230a 100644 --- a/volk/include/volk/volk_64f_convert_32f_u.h +++ b/volk/include/volk/volk_64f_convert_32f_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Converts the double values into float values diff --git a/volk/include/volk/volk_64f_x2_max_64f_a16.h b/volk/include/volk/volk_64f_x2_max_64f_a16.h index 4b0c1f5f1..21f488bf7 100644 --- a/volk/include/volk/volk_64f_x2_max_64f_a16.h +++ b/volk/include/volk/volk_64f_x2_max_64f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector @@ -45,7 +45,7 @@ static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* a } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_64f_x2_min_64f_a16.h b/volk/include/volk/volk_64f_x2_min_64f_a16.h index aa961e384..8711a0eae 100644 --- a/volk/include/volk/volk_64f_x2_min_64f_a16.h +++ b/volk/include/volk/volk_64f_x2_min_64f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector @@ -45,7 +45,7 @@ static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* a } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_64u_byteswap_a16.h b/volk/include/volk/volk_64u_byteswap_a16.h index 0eefe0138..b4bed8451 100644 --- a/volk/include/volk/volk_64u_byteswap_a16.h +++ b/volk/include/volk/volk_64u_byteswap_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! @@ -59,7 +59,7 @@ static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Byteswaps (in-place) an aligned vector of int64_t's. \param intsToSwap The vector of data to byte swap diff --git a/volk/include/volk/volk_64u_popcnt_a16.h b/volk/include/volk/volk_64u_popcnt_a16.h index 59511dc29..b416d052e 100644 --- a/volk/include/volk/volk_64u_popcnt_a16.h +++ b/volk/include/volk/volk_64u_popcnt_a16.h @@ -5,7 +5,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t value) { @@ -36,7 +36,7 @@ static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t val #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE4_2 && LV_HAVE_64 +#if defined(LV_HAVE_SSE4_2) && defined(LV_HAVE_64) #include diff --git a/volk/include/volk/volk_8i_convert_16i_a16.h b/volk/include/volk/volk_8i_convert_16i_a16.h index 3d7045753..260ac40a1 100644 --- a/volk/include/volk/volk_8i_convert_16i_a16.h +++ b/volk/include/volk/volk_8i_convert_16i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! @@ -47,7 +47,7 @@ static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const i } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 8 bit integer data into 16 bit integer data \param inputVector The 8 bit input data buffer @@ -65,7 +65,7 @@ static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Converts the input 8 bit integer data into 16 bit integer data \param inputVector The 8 bit input data buffer diff --git a/volk/include/volk/volk_8i_convert_16i_u.h b/volk/include/volk/volk_8i_convert_16i_u.h index bcff13406..7d7104f52 100644 --- a/volk/include/volk/volk_8i_convert_16i_u.h +++ b/volk/include/volk/volk_8i_convert_16i_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! @@ -48,7 +48,7 @@ static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 8 bit integer data into 16 bit integer data \param inputVector The 8 bit input data buffer diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h index 99a24ec10..9991b150e 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h +++ b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! @@ -66,7 +66,7 @@ static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, cons } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 8 bit input data buffer @@ -86,7 +86,7 @@ static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, con } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 8 bit input data buffer diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h index 1e30957e8..3cd6bb67c 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_u.h +++ b/volk/include/volk/volk_8i_s32f_convert_32f_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! @@ -67,7 +67,7 @@ static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 8 bit input data buffer diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h index 91c9b2c58..249acab49 100644 --- a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h +++ b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data @@ -51,7 +51,7 @@ static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h index bf3dc20dd..7b64b37c5 100644 --- a/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h +++ b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data @@ -42,7 +42,7 @@ static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, c #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h index 13de79423..a1abad487 100644 --- a/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h +++ b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include /*! \brief Deinterleaves the complex 8 bit vector into I vector data @@ -43,7 +43,7 @@ static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, cons } #endif /* LV_HAVE_SSSE3 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 8 bit vector into I vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h index 22c3ebb23..80032d2fe 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data @@ -74,7 +74,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data @@ -136,7 +136,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, flo } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h index 5f1430394..47a968ac1 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Deinterleaves the complex 8 bit vector into I float vector data @@ -61,7 +61,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Deinterleaves the complex 8 bit vector into I float vector data @@ -107,7 +107,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, c } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 8 bit vector into I float vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h index d9cacbf46..014f662a3 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector @@ -68,7 +68,7 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVe } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector \param cVector The complex vector where the results will be stored diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h index 6ec923a4f..ccf5eaa9d 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector @@ -87,7 +87,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector \param cVector The complex vector where the results will be stored diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index 6f444ad89..0218e668c 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -2,7 +2,7 @@ #define INCLUDED_LIBVECTOR_COMMON_H #include -#if LV_HAVE_MMX +#ifdef LV_HAVE_MMX #include union bit128{ uint16_t i16[8]; diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index b719042df..20b77da6c 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -21,6 +21,7 @@ from make_typedefs import make_typedefs from make_environment_init_c import make_environment_init_c from make_environment_init_h import make_environment_init_h from make_mktables import make_mktables +from make_makefile_am import make_makefile_am outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); outfile_reg = open("volk_registry.h", "w"); @@ -33,11 +34,12 @@ outfile_init_h = open("../../lib/volk_init.h", "w"); outfile_init_c = open("../../lib/volk_init.c", "w"); outfile_cpu_h = open("volk_cpu.h", "w"); outfile_cpu_c = open("../../lib/volk_cpu.c", "w"); -outfile_config_in = open("../../volk_config.h.in", "w"); +#outfile_config_in = open("../../volk_config.h.in", "w"); outfile_config_fixed = open("volk_config_fixed.h", "w"); outfile_mktables = open("../../lib/volk_mktables.c", "w"); outfile_environment_c = open("../../lib/volk_environment_init.c", "w"); outfile_environment_h = open("volk_environment_init.h", "w"); +outfile_makefile_am = open("../../lib/Makefile.am", "w"); infile = open("Makefile.am", "r"); @@ -223,8 +225,8 @@ outfile_cpu_c.close(); outfile_set_simd.write(make_set_simd(filearchs)); outfile_set_simd.close(); -outfile_config_in.write(make_config_in(filearchs)); -outfile_config_in.close(); +#outfile_config_in.write(make_config_in(filearchs)); +#outfile_config_in.close(); outfile_reg.write(make_registry(filearchs, functions, fcountlist, taglist)); outfile_reg.close(); @@ -261,3 +263,6 @@ outfile_environment_h.close(); outfile_mktables.write(make_mktables(functions)); outfile_mktables.close(); + +outfile_makefile_am.write(make_makefile_am(filearchs)) +outfile_makefile_am.close() -- cgit From 258186d5ca2e811ced7ea637fd16e3ed3bb5573e Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 22 Mar 2011 16:55:07 -0700 Subject: Interim commit. --- volk/include/volk/make_makefile_am.py | 140 +++++++++++++++++++++ volk/include/volk/make_set_simd.py | 13 +- .../volk_32fc_x2_conjugate_dot_prod_32fc_a16.h | 4 +- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 4 +- volk/include/volk/volk_64u_popcnt_a16.h | 2 +- volk/include/volk/volk_register.py | 43 ++++++- 6 files changed, 195 insertions(+), 11 deletions(-) create mode 100644 volk/include/volk/make_makefile_am.py (limited to 'volk/include') diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py new file mode 100644 index 000000000..89ea1f4b6 --- /dev/null +++ b/volk/include/volk/make_makefile_am.py @@ -0,0 +1,140 @@ +# +# Copyright 2010 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +from xml.dom import minidom + +def make_makefile_am(dom, machines): + tempstring = r""" +# This file is automatically generated by make_makefile_am.py. +# Do not edit this file. + +include $(top_srcdir)/Makefile.common + +#FIXME: forcing the top_builddir for distcheck seems like a bit +# of a hack. Figure out the right way to do this to find built +# volk_config.h and volk_tables.h + +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ + -I$(top_builddir)/include \ + $(WITH_INCLUDES) + +lib_LTLIBRARIES = \ + libvolk.la \ + libvolk_runtime.la + +EXTRA_DIST = \ + volk_mktables.c \ + volk_rank_archs.h \ + volk_proccpu_sim.c \ + gcc_x86_cpuid.h + +# ---------------------------------------------------------------- +# The main library +# ---------------------------------------------------------------- + +libvolk_runtime_la_SOURCES = \ + $(platform_CODE) \ + volk_runtime.c \ + volk_rank_archs.c \ + volk_cpu.c + +libvolk_la_SOURCES = \ + $(platform_CODE) \ + volk.c \ + volk_environment_init.c + +volk_orc_LDFLAGS = \ + $(ORC_LDFLAGS) \ + -lorc-0.4 + +volk_orc_LIBADD = \ + ../orc/libvolk_orc.la + +""" + + #here be dragons + for machine_name in machines: + tempstring += "if LV_MACHINE_" + machine_name.swapcase() + tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = " + + + tempstring += """ +if LV_HAVE_ORC +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) +libvolk_la_LIBADD = $(volk_orc_LIBADD) +else +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_la_LIBADD = +endif + + +# ---------------------------------------------------------------- +# The QA library. Note libvolk.la in LIBADD +# ---------------------------------------------------------------- +#libvolk_qa_la_SOURCES = \ +# qa_utils.cc + +#libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lboost + +#libvolk_qa_la_LIBADD = \ +# libvolk.la \ +# libvolk_runtime.la + +# ---------------------------------------------------------------- +# headers that don't get installed +# ---------------------------------------------------------------- +noinst_HEADERS = \ + volk_init.h \ + qa_utils.h + +# ---------------------------------------------------------------- +# Our test program +# ---------------------------------------------------------------- +noinst_PROGRAMS = \ + testqa + +testqa_SOURCES = testqa.cc qa_utils.cc +testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN $(AM_CPPFLAGS) +testqa_LDFLAGS = $(BOOST_UNIT_TEST_FRAMEWORK_LIB) +if LV_HAVE_ORC +testqa_LDADD = \ + libvolk.la \ + libvolk_runtime.la \ + ../orc/libvolk_orc.la +else +testqa_LDADD = \ + libvolk.la +endif + +distclean-local: + rm -f volk.c + rm -f volk_cpu_generic.c + rm -f volk_cpu_powerpc.c + rm -f volk_cpu_x86.c + rm -f volk_init.c + rm -f volk_init.h + rm -f volk_mktables.c + rm -f volk_proccpu_sim.c + rm -f volk_runtime.c + rm -f volk_tables.h + rm -f volk_environment_init.c +""" + + + return tempstring diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index da631d217..5a848e59e 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -17,7 +17,7 @@ from xml.dom import minidom -def make_set_simd(dom) : +def make_set_simd(dom, machines) : tempstring = ""; tempstring = tempstring +'dnl this file is auto generated by volk_register.py\n\n'; @@ -148,7 +148,16 @@ def make_set_simd(dom) : for domarch in dom: arch = str(domarch.attributes["name"].value); tempstring = tempstring + " AM_CONDITIONAL(LV_MAKE_" + arch.swapcase() + ", test \"$LV_MAKE_" + arch.swapcase() + "\" == \"yes\")\n"; - + + tempstring += "\n" + #now we can define the machines we're compiling + for machine_name in machines: + tempstring += " AM_CONDITIONAL(LV_MACHINE_" + machine_name.swapcase() + ", " + marchlist = machines[machine_name] + for march in marchlist: + tempstring += "test \"$LV_MAKE_" + march.swapcase() + "\" == \"yes\" && " + + tempstring += "test true)\n" #just so we don't have to detect the last one in the group, i know tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" tempstring = tempstring + "])\n" diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h index 2e5036f69..d78faf5b5 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h @@ -59,7 +59,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* r #endif /*LV_HAVE_GENERIC*/ -#if defined(LV_HAVE_SSE) && defined(LV_HAVE_64) +#if LV_HAVE_SSE && LV_HAVE_64 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -202,7 +202,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* resul } #endif -#if defined(LV_HAVE_SSE) && defined(LV_HAVE_32) +#if LV_HAVE_SSE && LV_HAVE_32 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h index 9bfb88934..b7b9768ab 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h @@ -56,7 +56,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, con #endif /*LV_HAVE_GENERIC*/ -#if defined(LV_HAVE_SSE) && defined(LV_HAVE_64) +#if LV_HAVE_SSE && LV_HAVE_64 static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -192,7 +192,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, cons #endif -#if defined(LV_HAVE_SSE) && defined(LV_HAVE_32) +#if LV_HAVE_SSE && LV_HAVE_32 static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { diff --git a/volk/include/volk/volk_64u_popcnt_a16.h b/volk/include/volk/volk_64u_popcnt_a16.h index b416d052e..8b92e91a1 100644 --- a/volk/include/volk/volk_64u_popcnt_a16.h +++ b/volk/include/volk/volk_64u_popcnt_a16.h @@ -36,7 +36,7 @@ static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t val #endif /*LV_HAVE_GENERIC*/ -#if defined(LV_HAVE_SSE4_2) && defined(LV_HAVE_64) +#if LV_HAVE_SSE4_2 && LV_HAVE_64 #include diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 20b77da6c..d5a4a968f 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -22,6 +22,7 @@ from make_environment_init_c import make_environment_init_c from make_environment_init_h import make_environment_init_h from make_mktables import make_mktables from make_makefile_am import make_makefile_am +import copy outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); outfile_reg = open("volk_registry.h", "w"); @@ -91,7 +92,41 @@ for arch in archs: archs_or = archs_or[0:len(archs_or)-1]; archs_or = archs_or + ")"; - +#get machine list and parse to a list of machines, each with a list of archs (none of this DOM crap) +machine_str_dict = {} +mfile = minidom.parse("machines.xml"); +filemachines = mfile.getElementsByTagName("machine") + +for filemachine in filemachines: + machine_str_dict[str(filemachine.attributes["name"].value)] = str(filemachine.getElementsByTagName("archs")[0].firstChild.data).split() + +#all right now you have a dict of arch lists +#next we expand it +#this is an expanded list accounting for the OR syntax +#TODO: make this work for multiple "|" machines +machines = {} +already_done = False +for machine_name in machine_str_dict: + already_done = False + marchlist = machine_str_dict[machine_name] + for march in marchlist: + or_marchs = march.split("|") + if len(or_marchs) > 1: + marchlist.remove(march) + for or_march in or_marchs: + tempmarchlist = copy.deepcopy(marchlist) + tempmarchlist.append(or_march) + machines[machine_name + "_" + or_march] = tempmarchlist + already_done = True + + if not already_done: + machines[machine_name] = marchlist + +#for machine_name in machines: +# print machine_name + ": " + str(machines[machine_name]) + +#ok, now we have all the machines we're going to build. next step is to generate a Makefile.am where they're all laid out and compiled + taglist = []; fcountlist = []; arched_arglist = []; @@ -107,7 +142,7 @@ for func in functions: sourcefile = infile_source.readlines(); infile_source.close(); for line in sourcefile: - +#FIXME: make it work for multiple #if define()s archline = re.search("^\#if.*?LV_HAVE_" + archs_or + ".*", line); if archline: arch = archline.group(0); @@ -222,7 +257,7 @@ outfile_cpu_h.close(); outfile_cpu_c.write(make_cpuid_c(filearchs)); outfile_cpu_c.close(); -outfile_set_simd.write(make_set_simd(filearchs)); +outfile_set_simd.write(make_set_simd(filearchs, machines)); outfile_set_simd.close(); #outfile_config_in.write(make_config_in(filearchs)); @@ -264,5 +299,5 @@ outfile_environment_h.close(); outfile_mktables.write(make_mktables(functions)); outfile_mktables.close(); -outfile_makefile_am.write(make_makefile_am(filearchs)) +outfile_makefile_am.write(make_makefile_am(filearchs, machines)) outfile_makefile_am.close() -- cgit From 5fffe801f95f2ef8bddf51aea8ed260eae0bf7b8 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 13 Apr 2011 18:32:28 -0700 Subject: Volk: make_makefile_am.py changes to generate cflags, ldflags. no conditional linking yet. --- volk/include/volk/make_makefile_am.py | 19 +++++++++++++++---- volk/include/volk/volk_register.py | 8 ++++++-- 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index 89ea1f4b6..c44fe11af 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -17,7 +17,7 @@ from xml.dom import minidom -def make_makefile_am(dom, machines): +def make_makefile_am(dom, machines, archflags_dict): tempstring = r""" # This file is automatically generated by make_makefile_am.py. # Do not edit this file. @@ -67,9 +67,20 @@ volk_orc_LIBADD = \ """ #here be dragons - for machine_name in machines: - tempstring += "if LV_MACHINE_" + machine_name.swapcase() - tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = " + for machine_name in machines: + tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" + tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = " + for arch in machines[machine_name]: + if archflags_dict[arch] != "none": + tempstring += "-" + archflags_dict[arch] + " " + + tempstring += "\nlibvolk_" + machine_name + "_la_CFLAGS = " + for arch in machines[machine_name]: + if archflags_dict[arch] != "none": + tempstring += "-DLV_HAVE_" + arch.swapcase() + " " + tempstring += "\nlibvolk_" + machine_name + "_la_SOURCES = $(libvolk_la_SOURCES)" + tempstring += "\nlibvolk_la_LIBADD = libvolk_" + machine_name + ".la" + tempstring += "\nendif\n" tempstring += """ diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index d5a4a968f..10610dcfe 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -83,8 +83,12 @@ for arch in archs: a_var = re.search("^\$", arch); if a_var: archs.remove(arch); + + - +archflags_dict = {} +for filearch in filearchs: + archflags_dict[str(filearch.attributes["name"].value)] = str(filearch.getElementsByTagName("flag")[0].firstChild.data) archs_or = "(" for arch in archs: @@ -299,5 +303,5 @@ outfile_environment_h.close(); outfile_mktables.write(make_mktables(functions)); outfile_mktables.close(); -outfile_makefile_am.write(make_makefile_am(filearchs, machines)) +outfile_makefile_am.write(make_makefile_am(filearchs, machines, archflags_dict)) outfile_makefile_am.close() -- cgit From 1877a842c81a78edd14d725b20a05b29b84c7b52 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 15 Apr 2011 12:11:08 -0700 Subject: Volk: volk_machine structures generated, volk_machines is instantiated. Testing manually works OK. All that's left is top-level volk.c and Automake wrangling. --- volk/include/volk/machines.xml | 47 ++++++++++++ volk/include/volk/make_each_machine_c.py | 83 ++++++++++++++++++++++ volk/include/volk/make_machines_c.py | 57 +++++++++++++++ volk/include/volk/make_machines_h.py | 50 +++++++++++++ volk/include/volk/make_makefile_am.py | 13 ++-- volk/include/volk/make_typedefs.py | 2 +- .../volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h | 3 +- .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h | 4 +- volk/include/volk/volk_common.h | 11 ++- volk/include/volk/volk_regexp.py | 6 ++ volk/include/volk/volk_register.py | 17 +++++ 11 files changed, 278 insertions(+), 15 deletions(-) create mode 100644 volk/include/volk/machines.xml create mode 100644 volk/include/volk/make_each_machine_c.py create mode 100644 volk/include/volk/make_machines_c.py create mode 100644 volk/include/volk/make_machines_h.py (limited to 'volk/include') diff --git a/volk/include/volk/machines.xml b/volk/include/volk/machines.xml new file mode 100644 index 000000000..8eed9e8d7 --- /dev/null +++ b/volk/include/volk/machines.xml @@ -0,0 +1,47 @@ + + + +generic + + + +generic 32|64 mmx + + + +generic 32|64 mmx sse + + + +generic 32|64 mmx sse sse2 + + + +generic 32|64 mmx sse sse2 sse3 + + + +generic 32|64 mmx sse sse2 sse3 ssse3 + + + +generic 32|64 mmx sse sse2 sse3 sse4_a popcount + + + +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 + + + +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount + + + +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount avx + + + +generic altivec + + + diff --git a/volk/include/volk/make_each_machine_c.py b/volk/include/volk/make_each_machine_c.py new file mode 100644 index 000000000..abf4bb2d5 --- /dev/null +++ b/volk/include/volk/make_each_machine_c.py @@ -0,0 +1,83 @@ +# +# Copyright 2010 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +from volk_regexp import * +import string +from emit_omnilog import * + +#ok todo list: +#put n_archs into the info struct so it doesn't have to be arch_defs[0]. + +def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist): + tempstring = r""" +// This file is automatically generated by make_each_machine_c.py. +// Do not edit this file. +""" + for arch in archs: + tempstring += "#define LV_HAVE_" + arch.swapcase() + " 1\n" + + tempstring += """ +#include +#include +#include + +""" + for func in functions: + tempstring += "#include \n" + tempstring += "\n\n" + + tempstring += emit_prolog(); + + for i in range(len(functions)): + tempstring += "static const " + replace_volk.sub("p", functions[i]) + " " + functions[i] + "_archs[] = {\n" + + tags_counter = 0 + for arch_list in fcountlist[i]: + ok = True + for arch in arch_list: + if arch.swapcase() not in archs: + ok = False + if ok: + tempstring += " " + functions[i] + "_" + str(taglist[i][tags_counter]) + ",\n" + tags_counter += 1 + + tempstring = strip_trailing(tempstring, ",") + tempstring += "};\n\n" + + + tempstring += "static unsigned int caps = " + for arch in archs: + tempstring += "(1 << LV_" + arch.swapcase() + ") + " + tempstring = strip_trailing(tempstring, " + ") + tempstring += ";\n" + + tempstring += "static const char* name = \"" + machine_name + "\";\n" + tempstring += "struct volk_machine volk_machine_" + machine_name + " = {\n" + tempstring += " caps,\n" + tempstring += " name,\n" + + for i in range(len(functions)): + tempstring += " { " + functions[i] + "_indices, " + functions[i] + "_arch_defs },\n" + tempstring += " " + functions[i] + "_archs,\n" + + tempstring = strip_trailing(tempstring, ",") + tempstring += "};\n" + tempstring += emit_epilog(); + + return tempstring + + diff --git a/volk/include/volk/make_machines_c.py b/volk/include/volk/make_machines_c.py new file mode 100644 index 000000000..55c0f1c06 --- /dev/null +++ b/volk/include/volk/make_machines_c.py @@ -0,0 +1,57 @@ +# +# Copyright 2010 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +from volk_regexp import * + +def make_machines_c(machines): + tempstring = r""" +// This file is automatically generated by make_machines_c.py. +// Do not edit this file. + +#include +#include +#include + +volk_machine volk_machines[] = { +""" + for machine in machines: + tempstring += """#if LV_MACHINE_""" + machine.swapcase() + "\n" + tempstring += "volk_machine_" + machine + tempstring += "," + tempstring += "\n#endif\n" + + tempstring += r""" +}; + +""" + + for machine in machines: + tempstring += "#if LV_MACHINE_" + machine.swapcase() + "\n" + tempstring += "#define LV_MACHINE_" + machine.swapcase() + "_CNT 1\n" + tempstring += "#else\n" + tempstring += "#define LV_MACHINE_" + machine.swapcase() + "_CNT 0\n" + tempstring += "#endif\n" + + tempstring += """unsigned int n_volk_machines = +""" + for machine in machines: + tempstring += "(LV_MACHINE_" + machine.swapcase() + "_CNT) " + tempstring += "+ " + tempstring = tempstring[:-2] + tempstring += ";\n" + + return tempstring diff --git a/volk/include/volk/make_machines_h.py b/volk/include/volk/make_machines_h.py new file mode 100644 index 000000000..68cdf3363 --- /dev/null +++ b/volk/include/volk/make_machines_h.py @@ -0,0 +1,50 @@ +# +# Copyright 2010 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +from volk_regexp import * + +def make_machines_h(functions, machines): + tempstring = r""" +// This file is automatically generated by make_machines_h.py. +// Do not edit this file. + +#ifndef INCLUDED_LIBVOLK_MACHINES_H +#define INCLUDED_LIBVOLK_MACHINES_H + +#include +#include + +struct volk_machine { + const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format) + const char *name; +""" + for function in functions: + tempstring += "\n const struct volk_func_desc " + function + "_desc;\n" + tempstring += " const " + replace_volk.sub("p", function) + " *" + function + "_archs;\n" + + tempstring += r"""}; + +""" + for machine in machines: + tempstring += """#if LV_MACHINE_""" + machine.swapcase() + "\n" + tempstring += "extern const struct volk_machine volk_machine_" + machine + ";\n" + tempstring += """#endif\n""" + + tempstring += r""" +#endif //INCLUDED_LIBVOLK_MACHINES_H""" + + return tempstring diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index c44fe11af..1c4c8fe7d 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -69,17 +69,16 @@ volk_orc_LIBADD = \ #here be dragons for machine_name in machines: tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" - tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = " + tempstring += "libvolk_" + machine_name + "_ar_LDFLAGS = " for arch in machines[machine_name]: if archflags_dict[arch] != "none": tempstring += "-" + archflags_dict[arch] + " " - tempstring += "\nlibvolk_" + machine_name + "_la_CFLAGS = " - for arch in machines[machine_name]: - if archflags_dict[arch] != "none": - tempstring += "-DLV_HAVE_" + arch.swapcase() + " " - tempstring += "\nlibvolk_" + machine_name + "_la_SOURCES = $(libvolk_la_SOURCES)" - tempstring += "\nlibvolk_la_LIBADD = libvolk_" + machine_name + ".la" +# tempstring += "\nlibvolk_" + machine_name + "_ar_CFLAGS = " +# for arch in machines[machine_name]: +# tempstring += "-DLV_HAVE_" + arch.swapcase() + " " + tempstring += "\nlibvolk_" + machine_name + "_ar_SOURCES = libvolk_machine_" + machine_name + ".cc" + tempstring += "\nlibvolk_la_LIBADD = libvolk_" + machine_name + ".ar" tempstring += "\nendif\n" diff --git a/volk/include/volk/make_typedefs.py b/volk/include/volk/make_typedefs.py index fe81cb2b0..8f9f2b55e 100644 --- a/volk/include/volk/make_typedefs.py +++ b/volk/include/volk/make_typedefs.py @@ -16,7 +16,7 @@ def make_typedefs(funclist, retlist, my_argtypelist) : tempstring = tempstring + '\n'; for i in range(len(funclist)): - tempstring = tempstring + "typedef " + retlist[i] +" (*" + replace_volk.sub("p", funclist[i]) + ")(" + my_argtypelist[i] + ");\n\n"; + tempstring = tempstring + "typedef " + retlist[i] +" (*" + replace_volk.sub("p", funclist[i]) + ")(" + my_argtypelist[i] + ");\n"; tempstring = tempstring + "#endif /*INCLUDED_VOLK_TYPEDEFS*/\n"; diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h index 014f662a3..9e8982e9b 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h @@ -23,7 +23,6 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVe const lv_8sc_t* a = aVector; const lv_8sc_t* b = bVector; __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - const int shuffleMask = _MM_SHUFFLE(2,3,0,1); for(;number < quarterPoints; number++){ // Convert into 8 bit values into 16 bit values @@ -37,7 +36,7 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVe y = _mm_sign_epi16(y, conjugateSign); // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); // Calculate the ar*(-ci) + cr*(ai) imagz = _mm_madd_epi16(x,y); diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h index ccf5eaa9d..fa58ff058 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h @@ -24,7 +24,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t const lv_8sc_t* a = aVector; const lv_8sc_t* b = bVector; __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + __m128 invScalar = _mm_set_ps1(1.0/scalar); for(;number < quarterPoints; number++){ @@ -39,7 +39,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t y = _mm_sign_epi16(y, conjugateSign); // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); // Calculate the ar*(-ci) + cr*(ai) imagz = _mm_madd_epi16(x,y); diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index 0218e668c..e050600f0 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_LIBVECTOR_COMMON_H -#define INCLUDED_LIBVECTOR_COMMON_H +#ifndef INCLUDED_LIBVOLK_COMMON_H +#define INCLUDED_LIBVOLK_COMMON_H #include #ifdef LV_HAVE_MMX @@ -15,4 +15,9 @@ union bit128{ }; #endif /*LV_HAVE_MMX*/ -#endif /*INCLUDED_LIBVECTOR_COMMON_H*/ +struct volk_func_desc { + const char **indices; + const int *arch_defs; +}; + +#endif /*INCLUDED_LIBVOLK_COMMON_H*/ diff --git a/volk/include/volk/volk_regexp.py b/volk/include/volk/volk_regexp.py index 7b695cb3b..b83ce5206 100644 --- a/volk/include/volk/volk_regexp.py +++ b/volk/include/volk/volk_regexp.py @@ -1,4 +1,5 @@ import re +import string remove_after_underscore = re.compile("_.*"); space_remove = re.compile(" "); @@ -6,3 +7,8 @@ leading_space_remove = re.compile("^ *"); replace_arch = re.compile(", const char\* arch"); replace_bracket = re.compile(" {"); replace_volk = re.compile("volk"); + +def strip_trailing(tostrip, stripstr): + lindex = tostrip.rfind(stripstr) + tostrip = tostrip[0:lindex] + string.replace(tostrip[lindex:len(tostrip)], stripstr, ""); + return tostrip diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 10610dcfe..9d33abe89 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -22,6 +22,9 @@ from make_environment_init_c import make_environment_init_c from make_environment_init_h import make_environment_init_h from make_mktables import make_mktables from make_makefile_am import make_makefile_am +from make_machines_h import make_machines_h +from make_machines_c import make_machines_c +from make_each_machine_c import make_each_machine_c import copy outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); @@ -41,6 +44,8 @@ outfile_mktables = open("../../lib/volk_mktables.c", "w"); outfile_environment_c = open("../../lib/volk_environment_init.c", "w"); outfile_environment_h = open("volk_environment_init.h", "w"); outfile_makefile_am = open("../../lib/Makefile.am", "w"); +outfile_machines_h = open("volk_machines.h", "w"); +outfile_machines_c = open("../../lib/volk_machines.c", "w"); infile = open("Makefile.am", "r"); @@ -305,3 +310,15 @@ outfile_mktables.close(); outfile_makefile_am.write(make_makefile_am(filearchs, machines, archflags_dict)) outfile_makefile_am.close() + +outfile_machines_h.write(make_machines_h(functions, machines)) +outfile_machines_h.close() + +outfile_machines_c.write(make_machines_c(machines)) +outfile_machines_c.close() + +for machine in machines: + machine_c_filename = "../../lib/volk_machine_" + machine + ".c" + outfile_machine_c = open(machine_c_filename, "w") + outfile_machine_c.write(make_each_machine_c(machine, machines[machine], functions, fcountlist, taglist)) + outfile_machine_c.close() -- cgit From 8608fc3ae5f891eb5c11345a9b6102fdc8ae2393 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 15 Apr 2011 17:51:28 -0700 Subject: Volk: volk.c is now generated. manual build works OK. --- volk/include/volk/make_c.py | 127 +++++++++++++++++++++-------------- volk/include/volk/make_h.py | 28 ++++---- volk/include/volk/make_init_c.py | 42 ------------ volk/include/volk/make_init_h.py | 26 ------- volk/include/volk/make_machines_c.py | 2 +- volk/include/volk/make_mktables.py | 33 --------- volk/include/volk/make_runtime.py | 27 -------- volk/include/volk/make_runtime_c.py | 47 ------------- volk/include/volk/volk_register.py | 51 +++----------- 9 files changed, 102 insertions(+), 281 deletions(-) delete mode 100644 volk/include/volk/make_init_c.py delete mode 100644 volk/include/volk/make_init_h.py delete mode 100644 volk/include/volk/make_mktables.py delete mode 100644 volk/include/volk/make_runtime.py delete mode 100644 volk/include/volk/make_runtime_c.py (limited to 'volk/include') diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index 89bf9ea1a..5f6669e40 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -1,55 +1,84 @@ -from xml.dom import minidom -import string +# +# Copyright 2010 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + from volk_regexp import * +import string +from emit_omnilog import * +#ok todo list: +#put n_archs into the info struct so it doesn't have to be arch_defs[0]. -def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - for func in funclist: - tempstring = tempstring + "#include\n" ; - tempstring = tempstring + '\n'; - - tempstring = tempstring + "static inline unsigned int volk_get_index(const char** indices, const char* arch, const int* arch_defs) {\n"; - tempstring = tempstring + " int i = 1;\n" - tempstring = tempstring + " for(;i +#include +#include +#include +#include +#include +#include +#include + +""" + tempstring += emit_prolog(); - tempstring = tempstring + retlist[i] + "inline " + funclist[i] + replace_arch.sub("", arched_arglist[i]) + '\n'; +#OK here's the deal. the .h prototypes the functions. the .c impls them as fptrs, can use p_whatever. +#also .c impls the get_machine call +#also .c impls the default call for each fn + +#here do static fn get arch + tempstring += r""" +struct volk_machine *get_machine(void) { + extern struct volk_machine volk_machines[]; + extern unsigned int n_volk_machines; + static struct volk_machine *machine = NULL; - tempstring = tempstring + funclist[i] + "_archs[" + funclist[i] + "_func_table](" + my_arglist[i] + ");" + '\n'; - tempstring = tempstring + "}\n\n"; + if(machine != NULL) return machine; + else { + unsigned int max_score = 0; + int i; + for(i=0; i max_score) { + max_score = volk_machines[i].caps; + machine = &(volk_machines[i]); + } + } + } + printf("Using Volk machine: %s\n", machine->name); + return machine; + } +} + +""" - return tempstring; + for i in range(len(functions)): + tempstring += "void get_" + functions[i] + replace_arch.sub("", arched_arglist[i]) + "\n" + tempstring += " %s = get_machine()->%s_archs[volk_rank_archs(get_machine()->%s_desc.arch_defs, volk_get_lvarch())];\n" % (functions[i], functions[i], functions[i]) + tempstring += " %s(%s);\n}\n\n" % (functions[i], my_arglist[i]) + tempstring += replace_volk.sub("p", functions[i]) + " " + functions[i] + " = &get_" + functions[i] + ";\n\n" + + + tempstring += emit_epilog(); + + return tempstring + + diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py index 6aea441b7..b235cd657 100644 --- a/volk/include/volk/make_h.py +++ b/volk/include/volk/make_h.py @@ -4,24 +4,24 @@ from volk_regexp import * -def make_h(funclist, arched_arglist, retlist) : +def make_h(funclist) : tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_H'; - tempstring = tempstring + '\n#define INCLUDED_VOLK_H'; - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; + tempstring = tempstring + '/*this file is auto generated by make_h.py*/\n'; + + tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_RUNTIME'; + tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME'; + tempstring = tempstring + '\n\n#include\n'; tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n' - tempstring = tempstring + emit_prolog() - tempstring = tempstring + '\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + emit_prolog(); - for i in range(len(retlist)): - tempstring = tempstring + retlist[i] + funclist[i] + replace_bracket.sub(";", replace_arch.sub("", arched_arglist[i])) + '\n'; - tempstring = tempstring + retlist[i] + funclist[i] + "_manual" + replace_bracket.sub(";", arched_arglist[i]) + '\n'; + tempstring = tempstring + '\n'; + + for i in range(len(funclist)): + tempstring = tempstring + "extern " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + ";\n" tempstring = tempstring + emit_epilog(); + tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; - tempstring = tempstring + "#endif /*INCLUDED_VOLK_H*/\n"; - return tempstring; + diff --git a/volk/include/volk/make_init_c.py b/volk/include/volk/make_init_c.py deleted file mode 100644 index 0a7010cd6..000000000 --- a/volk/include/volk/make_init_c.py +++ /dev/null @@ -1,42 +0,0 @@ -from xml.dom import minidom - -def make_init_c(funclist, dom) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - incs = domarch.getElementsByTagName("include"); - for inc in incs: - my_inc = str(inc.firstChild.data); - tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; - tempstring = tempstring + "#include<" + my_inc + ">\n"; - tempstring = tempstring + "#endif\n" - tempstring = tempstring + '\n\n'; - - tempstring = tempstring + "extern struct VOLK_RUNTIME volk_runtime;\n\n"; - tempstring = tempstring + "struct VOLK_RUNTIME* get_volk_runtime(){\n"; - tempstring = tempstring + " return &volk_runtime;\n"; - tempstring = tempstring + "}\n\n" - tempstring = tempstring + " void volk_runtime_init() {\nvolk_cpu_init();\n"; - - for func in funclist: - tempstring = tempstring + " volk_runtime." + func + " = default_acquire_" + func + ";\n"; - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - envs = domarch.getElementsByTagName("environment"); - for env in envs: - cmd = str(env.firstChild.data); - tempstring = tempstring + " if(volk_cpu.has_" + arch + "()){\n"; - tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; - tempstring = tempstring + " " + cmd + "\n"; - tempstring = tempstring + "#endif\n" - tempstring = tempstring + " }\n"; - - tempstring = tempstring + "}\n"; - - return tempstring diff --git a/volk/include/volk/make_init_h.py b/volk/include/volk/make_init_h.py deleted file mode 100644 index 6dbe1c585..000000000 --- a/volk/include/volk/make_init_h.py +++ /dev/null @@ -1,26 +0,0 @@ -from xml.dom import minidom -from emit_omnilog import * -from volk_regexp import * - - - -def make_init_h(funclist, arched_arglist, retlist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - - tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_INIT_H'; - tempstring = tempstring + '\n#define INCLUDED_VOLK_INIT_H'; - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; - - tempstring = tempstring + '\n'; - - tempstring = tempstring + emit_prolog(); - - for i in range(len(retlist)): - tempstring = tempstring + retlist[i] + " default_acquire_" + funclist[i] + replace_bracket.sub(";", replace_arch.sub("", arched_arglist[i])) + '\n'; - - tempstring= tempstring + emit_epilog(); - tempstring = tempstring + "#endif /*INCLUDED_VOLK_INIT_H*/\n"; - - return tempstring; diff --git a/volk/include/volk/make_machines_c.py b/volk/include/volk/make_machines_c.py index 55c0f1c06..12eabf98d 100644 --- a/volk/include/volk/make_machines_c.py +++ b/volk/include/volk/make_machines_c.py @@ -26,7 +26,7 @@ def make_machines_c(machines): #include #include -volk_machine volk_machines[] = { +struct volk_machine volk_machines[] = { """ for machine in machines: tempstring += """#if LV_MACHINE_""" + machine.swapcase() + "\n" diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py deleted file mode 100644 index 051ac268d..000000000 --- a/volk/include/volk/make_mktables.py +++ /dev/null @@ -1,33 +0,0 @@ - - -def make_mktables(funclist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; - - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstrgin = tempstring + '#include\n'; - tempstring = tempstring + "\n\n"; - - tempstring = tempstring + 'int main() {\n'; - tempstring = tempstring + ' int i = 0;\n'; - tempstring = tempstring + ' FILE* output;\n'; - tempstring = tempstring + ' output = fopen("volk_tables.h", "w");\n'; - tempstring = tempstring + ' fprintf(output, "#ifndef INCLUDED_VOLK_TABLES_H\\n");\n'; - tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n'; - - for func in funclist: - tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", volk_rank_archs(' + func + '_arch_defs, volk_get_lvarch()));\n'; - tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n'; - tempstring = tempstring + ' fclose(output);\n' - tempstring = tempstring + '}\n'; - return tempstring; - - - - - - - - diff --git a/volk/include/volk/make_runtime.py b/volk/include/volk/make_runtime.py deleted file mode 100644 index d468487d7..000000000 --- a/volk/include/volk/make_runtime.py +++ /dev/null @@ -1,27 +0,0 @@ -from xml.dom import minidom -from emit_omnilog import * -from volk_regexp import * - - - -def make_runtime(funclist, arglist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; - - tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_RUNTIME'; - tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME'; - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + emit_prolog(); - - tempstring = tempstring + '\n'; - - for i in range(len(funclist)): - tempstring = tempstring + "extern void (*" + funclist[i] + ")(" + arglist[i] + ");\n" - - tempstring = tempstring + emit_epilog(); - tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; - - return tempstring; - diff --git a/volk/include/volk/make_runtime_c.py b/volk/include/volk/make_runtime_c.py deleted file mode 100644 index 99cdf395f..000000000 --- a/volk/include/volk/make_runtime_c.py +++ /dev/null @@ -1,47 +0,0 @@ -from xml.dom import minidom -import string -from volk_regexp import * - - -def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist, my_argtypelist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - - - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + "#include\n"; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - - for func in funclist: - tempstring = tempstring + "#include\n" ; - tempstring = tempstring + '\n'; - - for i in range(len(funclist)): - tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n"; - - tags_counter = 0; - for arch_list in fcountlist[i]: - tempstring = tempstring + "#if defined(LV_HAVE_" - for ind in range(len(arch_list)): - - tempstring = tempstring + arch_list[ind] + ")"; - if ind < len(arch_list) - 1: - tempstring = tempstring + " && defined(LV_HAVE_"; - - tempstring = tempstring + "\n " + funclist[i] + "_" + str(taglist[i][tags_counter]) + ",\n#endif\n"; - tags_counter = tags_counter + 1; - - lindex = tempstring.rfind(","); - tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); - tempstring = tempstring + "};\n\n"; - - - tempstring = tempstring + retlist[i] + "default_acquire_" + funclist[i] + replace_arch.sub("", arched_arglist[i]) + '\n'; - tempstring = tempstring + " %s = %s_archs[volk_rank_archs(%s_arch_defs, volk_get_lvarch())];\n" % (funclist[i], funclist[i], funclist[i]) - tempstring = tempstring + " %s(%s);\n}\n\n" % (funclist[i], my_arglist[i]) - - tempstring = tempstring + "%s(*%s)(%s) = &default_acquire_%s;\n\n" % (retlist[i], funclist[i], my_argtypelist[i], funclist[i]) - - return tempstring; diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 9d33abe89..437fb33b1 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -9,38 +9,29 @@ from make_cpuid_c import make_cpuid_c from make_cpuid_h import make_cpuid_h from make_set_simd import make_set_simd from make_registry import make_registry -from make_h import make_h -from make_init_h import make_init_h from make_config_fixed import make_config_fixed -from make_config_in import make_config_in -from make_c import make_c -from make_runtime_c import make_runtime_c -from make_init_c import make_init_c -from make_runtime import make_runtime from make_typedefs import make_typedefs from make_environment_init_c import make_environment_init_c from make_environment_init_h import make_environment_init_h -from make_mktables import make_mktables from make_makefile_am import make_makefile_am from make_machines_h import make_machines_h from make_machines_c import make_machines_c from make_each_machine_c import make_each_machine_c +from make_c import make_c +from make_h import make_h import copy outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); outfile_reg = open("volk_registry.h", "w"); outfile_h = open("volk.h", "w"); outfile_c = open("../../lib/volk.c", "w"); -outfile_runtime = open("volk_runtime.h", "w"); -outfile_runtime_c = open("../../lib/volk_runtime.c", "w"); outfile_typedefs = open("volk_typedefs.h", "w"); outfile_init_h = open("../../lib/volk_init.h", "w"); -outfile_init_c = open("../../lib/volk_init.c", "w"); outfile_cpu_h = open("volk_cpu.h", "w"); outfile_cpu_c = open("../../lib/volk_cpu.c", "w"); #outfile_config_in = open("../../volk_config.h.in", "w"); outfile_config_fixed = open("volk_config_fixed.h", "w"); -outfile_mktables = open("../../lib/volk_mktables.c", "w"); +#outfile_mktables = open("../../lib/volk_mktables.c", "w"); outfile_environment_c = open("../../lib/volk_environment_init.c", "w"); outfile_environment_h = open("volk_environment_init.h", "w"); outfile_makefile_am = open("../../lib/Makefile.am", "w"); @@ -268,46 +259,16 @@ outfile_cpu_c.close(); outfile_set_simd.write(make_set_simd(filearchs, machines)); outfile_set_simd.close(); - -#outfile_config_in.write(make_config_in(filearchs)); -#outfile_config_in.close(); outfile_reg.write(make_registry(filearchs, functions, fcountlist, taglist)); outfile_reg.close(); -outfile_h.write(make_h(functions, arched_arglist, retlist)); -outfile_h.close(); - -outfile_init_h.write(make_init_h(functions, arched_arglist, retlist)); -outfile_init_h.close(); - outfile_config_fixed.write(make_config_fixed(filearchs)); outfile_config_fixed.close(); -outfile_c.write( make_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist)); -outfile_c.close(); - -outfile_runtime_c.write(make_runtime_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist, my_argtypelist)); -outfile_runtime_c.close(); - -outfile_init_c.write(make_init_c(functions, filearchs)); -outfile_init_c.close(); - -outfile_runtime.write(make_runtime(functions, my_argtypelist)); -outfile_runtime.close(); - outfile_typedefs.write(make_typedefs(functions, retlist, my_argtypelist)); outfile_typedefs.close(); -outfile_environment_c.write(make_environment_init_c(filearchs)); -outfile_environment_c.close(); - -outfile_environment_h.write(make_environment_init_h()); -outfile_environment_h.close(); - -outfile_mktables.write(make_mktables(functions)); -outfile_mktables.close(); - outfile_makefile_am.write(make_makefile_am(filearchs, machines, archflags_dict)) outfile_makefile_am.close() @@ -317,6 +278,12 @@ outfile_machines_h.close() outfile_machines_c.write(make_machines_c(machines)) outfile_machines_c.close() +outfile_c.write(make_c(machines, archs, functions, arched_arglist, my_arglist)) +outfile_c.close() + +outfile_h.write(make_h(functions)) +outfile_h.close() + for machine in machines: machine_c_filename = "../../lib/volk_machine_" + machine + ".c" outfile_machine_c = open(machine_c_filename, "w") -- cgit From d59273f379d7ab997e50c1dd8b0ac3dfc7bd1e33 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 15 Apr 2011 22:48:13 -0700 Subject: Volk: lib compiles & links & works for hand test. testsuite doesn't due to lack of _manual() capability. --- volk/include/volk/Makefile.am | 18 ------------ volk/include/volk/machines.xml | 2 ++ volk/include/volk/make_c.py | 14 +++++----- volk/include/volk/make_each_machine_c.py | 6 ++-- volk/include/volk/make_machines_c.py | 6 ++-- volk/include/volk/make_machines_h.py | 4 +-- volk/include/volk/make_makefile_am.py | 48 +++++++++++++------------------- volk/include/volk/volk_register.py | 6 ++-- 8 files changed, 39 insertions(+), 65 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index caf707138..876bb7a60 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -25,15 +25,10 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ volkincludedir = $(prefix)/include/volk -BUILT_SOURCES: \ - volk_tables.h - volkinclude_HEADERS = \ volk_complex.h \ volk_common.h \ volk_config_fixed.h \ - volk_runtime.h \ - volk_tables.h \ volk_typedefs.h \ volk_registry.h \ volk.h \ @@ -128,19 +123,6 @@ volkinclude_HEADERS = \ volk_8i_s32f_convert_32f_a16.h \ volk_8i_s32f_convert_32f_u.h -VOLK_MKTABLES_SOURCES = \ - $(platform_CODE) \ - $(top_srcdir)/lib/volk_rank_archs.c \ - $(top_srcdir)/lib/volk_mktables.c \ - $(top_srcdir)/lib/volk_cpu.c - - -volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) - $(CC) -o $@ $^ $(AM_CPPFLAGS) -I$(top_builddir)/include - -volk_tables.h: volk_mktables$(EXEEXT) - ./volk_mktables$(EXEEXT) - distclean-local: rm -f volk_config_fixed.h rm -f volk_cpu.h diff --git a/volk/include/volk/machines.xml b/volk/include/volk/machines.xml index 8eed9e8d7..ad71da177 100644 --- a/volk/include/volk/machines.xml +++ b/volk/include/volk/machines.xml @@ -4,6 +4,7 @@ generic + generic 32|64 mmx sse sse2 diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index 5f6669e40..7cb0616c8 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -28,13 +28,13 @@ def make_c(machines, archs, functions, arched_arglist, my_arglist): // Do not edit this file. """ tempstring += """ -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include "volk_rank_archs.h" +#include #include """ diff --git a/volk/include/volk/make_each_machine_c.py b/volk/include/volk/make_each_machine_c.py index abf4bb2d5..866e18bc3 100644 --- a/volk/include/volk/make_each_machine_c.py +++ b/volk/include/volk/make_each_machine_c.py @@ -31,9 +31,9 @@ def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist): tempstring += "#define LV_HAVE_" + arch.swapcase() + " 1\n" tempstring += """ -#include -#include -#include +#include +#include +#include """ for func in functions: diff --git a/volk/include/volk/make_machines_c.py b/volk/include/volk/make_machines_c.py index 12eabf98d..9ad56fb62 100644 --- a/volk/include/volk/make_machines_c.py +++ b/volk/include/volk/make_machines_c.py @@ -22,9 +22,9 @@ def make_machines_c(machines): // This file is automatically generated by make_machines_c.py. // Do not edit this file. -#include -#include -#include +#include +#include +#include struct volk_machine volk_machines[] = { """ diff --git a/volk/include/volk/make_machines_h.py b/volk/include/volk/make_machines_h.py index 68cdf3363..674ee12cd 100644 --- a/volk/include/volk/make_machines_h.py +++ b/volk/include/volk/make_machines_h.py @@ -25,8 +25,8 @@ def make_machines_h(functions, machines): #ifndef INCLUDED_LIBVOLK_MACHINES_H #define INCLUDED_LIBVOLK_MACHINES_H -#include -#include +#include +#include struct volk_machine { const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format) diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index 1c4c8fe7d..2c9ef8fb6 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -24,20 +24,14 @@ def make_makefile_am(dom, machines, archflags_dict): include $(top_srcdir)/Makefile.common -#FIXME: forcing the top_builddir for distcheck seems like a bit -# of a hack. Figure out the right way to do this to find built -# volk_config.h and volk_tables.h - AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ -I$(top_builddir)/include \ $(WITH_INCLUDES) lib_LTLIBRARIES = \ - libvolk.la \ - libvolk_runtime.la + libvolk.la EXTRA_DIST = \ - volk_mktables.c \ volk_rank_archs.h \ volk_proccpu_sim.c \ gcc_x86_cpuid.h @@ -46,16 +40,14 @@ EXTRA_DIST = \ # The main library # ---------------------------------------------------------------- -libvolk_runtime_la_SOURCES = \ +libvolk_la_SOURCES = \ $(platform_CODE) \ - volk_runtime.c \ + volk.cc \ + volk_cpu.c \ volk_rank_archs.c \ - volk_cpu.c + volk_machines.cc -libvolk_la_SOURCES = \ - $(platform_CODE) \ - volk.c \ - volk_environment_init.c +libvolk_la_LDFLAGS = volk_orc_LDFLAGS = \ $(ORC_LDFLAGS) \ @@ -66,31 +58,31 @@ volk_orc_LIBADD = \ """ - #here be dragons + #here be dragons + tempstring += "libvolk_la_LIBADD = \n" + tempstring += "libvolk_la_CPPFLAGS = $(AM_CPPFLAGS)\n" + tempstring += "noinst_LTLIBRARIES = \n" for machine_name in machines: tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" - tempstring += "libvolk_" + machine_name + "_ar_LDFLAGS = " + tempstring += "libvolk_" + machine_name + "_la_SOURCES = volk_machine_" + machine_name + ".cc\n" + tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include " for arch in machines[machine_name]: if archflags_dict[arch] != "none": tempstring += "-" + archflags_dict[arch] + " " -# tempstring += "\nlibvolk_" + machine_name + "_ar_CFLAGS = " -# for arch in machines[machine_name]: -# tempstring += "-DLV_HAVE_" + arch.swapcase() + " " - tempstring += "\nlibvolk_" + machine_name + "_ar_SOURCES = libvolk_machine_" + machine_name + ".cc" - tempstring += "\nlibvolk_la_LIBADD = libvolk_" + machine_name + ".ar" + tempstring += "\nnoinst_LTLIBRARIES += libvolk_" + machine_name + ".la " + tempstring += "\nlibvolk_la_LIBADD += libvolk_" + machine_name + ".la\n" + tempstring += "libvolk_la_CPPFLAGS += -DLV_MACHINE_" + machine_name.swapcase() + " " tempstring += "\nendif\n" tempstring += """ if LV_HAVE_ORC -libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) -libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) -libvolk_la_LIBADD = $(volk_orc_LIBADD) +libvolk_la_LDFLAGS += $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) +libvolk_la_LIBADD += $(volk_orc_LIBADD) else -libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -libvolk_la_LIBADD = +libvolk_la_LDFLAGS += $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_la_LIBADD += endif @@ -125,7 +117,6 @@ testqa_LDFLAGS = $(BOOST_UNIT_TEST_FRAMEWORK_LIB) if LV_HAVE_ORC testqa_LDADD = \ libvolk.la \ - libvolk_runtime.la \ ../orc/libvolk_orc.la else testqa_LDADD = \ @@ -141,7 +132,6 @@ distclean-local: rm -f volk_init.h rm -f volk_mktables.c rm -f volk_proccpu_sim.c - rm -f volk_runtime.c rm -f volk_tables.h rm -f volk_environment_init.c """ diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 437fb33b1..a319e9dad 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -24,7 +24,7 @@ import copy outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); outfile_reg = open("volk_registry.h", "w"); outfile_h = open("volk.h", "w"); -outfile_c = open("../../lib/volk.c", "w"); +outfile_c = open("../../lib/volk.cc", "w"); outfile_typedefs = open("volk_typedefs.h", "w"); outfile_init_h = open("../../lib/volk_init.h", "w"); outfile_cpu_h = open("volk_cpu.h", "w"); @@ -36,7 +36,7 @@ outfile_environment_c = open("../../lib/volk_environment_init.c", "w"); outfile_environment_h = open("volk_environment_init.h", "w"); outfile_makefile_am = open("../../lib/Makefile.am", "w"); outfile_machines_h = open("volk_machines.h", "w"); -outfile_machines_c = open("../../lib/volk_machines.c", "w"); +outfile_machines_c = open("../../lib/volk_machines.cc", "w"); infile = open("Makefile.am", "r"); @@ -285,7 +285,7 @@ outfile_h.write(make_h(functions)) outfile_h.close() for machine in machines: - machine_c_filename = "../../lib/volk_machine_" + machine + ".c" + machine_c_filename = "../../lib/volk_machine_" + machine + ".cc" outfile_machine_c = open(machine_c_filename, "w") outfile_machine_c.write(make_each_machine_c(machine, machines[machine], functions, fcountlist, taglist)) outfile_machine_c.close() -- cgit From 7c48568abc3f0aa918e199f68acca5740250ee5d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 15 Apr 2011 23:04:29 -0700 Subject: Volk: hack Orc back in --- volk/include/volk/make_makefile_am.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index 2c9ef8fb6..bd6283b58 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -65,7 +65,7 @@ volk_orc_LIBADD = \ for machine_name in machines: tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" tempstring += "libvolk_" + machine_name + "_la_SOURCES = volk_machine_" + machine_name + ".cc\n" - tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include " + tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include $(volk_orc_CFLAGS) " for arch in machines[machine_name]: if archflags_dict[arch] != "none": tempstring += "-" + archflags_dict[arch] + " " @@ -80,9 +80,11 @@ volk_orc_LIBADD = \ if LV_HAVE_ORC libvolk_la_LDFLAGS += $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) libvolk_la_LIBADD += $(volk_orc_LIBADD) +volk_orc_CFLAGS = -DLV_HAVE_ORC else libvolk_la_LDFLAGS += $(NO_UNDEFINED) -version-info 0:0:0 libvolk_la_LIBADD += +volk_orc_CFLAGS = endif -- cgit From 8b04d29221719239b52a300a338513f05746ed7f Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Sat, 16 Apr 2011 10:05:43 -0700 Subject: Volk: split n_archs out of arch_defs[0], began to add _manual support --- volk/include/volk/make_c.py | 9 ++++++++- volk/include/volk/make_each_machine_c.py | 2 +- volk/include/volk/make_registry.py | 33 +++++++++++++++++--------------- volk/include/volk/volk_common.h | 1 + 4 files changed, 28 insertions(+), 17 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index 7cb0616c8..1129285bb 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -66,13 +66,20 @@ struct volk_machine *get_machine(void) { printf("Using Volk machine: %s\n", machine->name); return machine; } +} + +static unsigned int get_index(const char **indices, char *arch_name) { + + + + } """ for i in range(len(functions)): tempstring += "void get_" + functions[i] + replace_arch.sub("", arched_arglist[i]) + "\n" - tempstring += " %s = get_machine()->%s_archs[volk_rank_archs(get_machine()->%s_desc.arch_defs, volk_get_lvarch())];\n" % (functions[i], functions[i], functions[i]) + tempstring += " %s = get_machine()->%s_archs[volk_rank_archs(get_machine()->%s_desc.arch_defs, get_machine()->%s_desc.n_archs, volk_get_lvarch())];\n" % (functions[i], functions[i], functions[i], functions[i]) tempstring += " %s(%s);\n}\n\n" % (functions[i], my_arglist[i]) tempstring += replace_volk.sub("p", functions[i]) + " " + functions[i] + " = &get_" + functions[i] + ";\n\n" diff --git a/volk/include/volk/make_each_machine_c.py b/volk/include/volk/make_each_machine_c.py index 866e18bc3..11f669566 100644 --- a/volk/include/volk/make_each_machine_c.py +++ b/volk/include/volk/make_each_machine_c.py @@ -71,7 +71,7 @@ def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist): tempstring += " name,\n" for i in range(len(functions)): - tempstring += " { " + functions[i] + "_indices, " + functions[i] + "_arch_defs },\n" + tempstring += " { " + functions[i] + "_indices, " + functions[i] + "_arch_defs, " + functions[i] + "_n_archs },\n" tempstring += " " + functions[i] + "_archs,\n" tempstring = strip_trailing(tempstring, ",") diff --git a/volk/include/volk/make_registry.py b/volk/include/volk/make_registry.py index 7fbe9a8b0..de1f46aa6 100644 --- a/volk/include/volk/make_registry.py +++ b/volk/include/volk/make_registry.py @@ -1,5 +1,6 @@ from xml.dom import minidom from emit_omnilog import * +from volk_regexp import * import string def make_registry(dom, funclist, fcountlist, taglist) : @@ -39,24 +40,13 @@ def make_registry(dom, funclist, fcountlist, taglist) : tempstring = tempstring + "\n \"" + str(taglist[i][tags_counter]) + "\",\n#endif\n"; tags_counter = tags_counter + 1; - lindex = tempstring.rfind(","); - tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); + tempstring = strip_trailing(tempstring, ",") tempstring = tempstring + "};\n\n"; for fcount in fcountlist: tempstring = tempstring + "static const int " + funclist[counter] + "_arch_defs[] = {\n"; - counter = counter + 1; - for arch_list in fcount: - tempstring = tempstring + " (LV_" - for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind] + "_CNT"; - if ind < len(arch_list) - 1: - tempstring = tempstring + " * LV_"; - tempstring = tempstring + ") + "; - lindex = tempstring.rfind(" + "); - tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], " + ", ""); - tempstring = tempstring + ",\n" + counter += 1; for arch_list in fcount: tempstring = tempstring + "#if defined(LV_HAVE_" for ind in range(len(arch_list)): @@ -70,9 +60,22 @@ def make_registry(dom, funclist, fcountlist, taglist) : if ind < len(arch_list) - 1: tempstring = tempstring + ") + (1 << LV_" tempstring = tempstring + "),\n#endif\n" - lindex = tempstring.rfind(","); - tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); + tempstring = strip_trailing(tempstring, ",") tempstring = tempstring + "};\n\n" + + counter = 0; + for fcount in fcountlist: + tempstring += "static const int " + funclist[counter] + "_n_archs = " + counter += 1; + for arch_list in fcount: + tempstring = tempstring + " (LV_" + for ind in range(len(arch_list)): + tempstring = tempstring + arch_list[ind] + "_CNT"; + if ind < len(arch_list) - 1: + tempstring = tempstring + " * LV_"; + tempstring = tempstring + ") + "; + tempstring = strip_trailing(tempstring, " + "); + tempstring = tempstring + ";\n" tempstring = tempstring + emit_epilog(); diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index e050600f0..0a47ff864 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -18,6 +18,7 @@ union bit128{ struct volk_func_desc { const char **indices; const int *arch_defs; + const int n_archs; }; #endif /*INCLUDED_LIBVOLK_COMMON_H*/ -- cgit From 668da8bd8874ae71a819d55f046e39c964a4270b Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Sat, 16 Apr 2011 10:47:10 -0700 Subject: Volk: manual funcs implemented, QA code runs. Barfs due to missing Orc arch. --- volk/include/volk/make_c.py | 20 ++++++++++++++------ volk/include/volk/make_each_machine_c.py | 2 +- volk/include/volk/make_h.py | 7 +++++-- volk/include/volk/volk_register.py | 2 +- 4 files changed, 21 insertions(+), 10 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index 1129285bb..ee9526860 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -36,6 +36,7 @@ def make_c(machines, archs, functions, arched_arglist, my_arglist): #include "volk_rank_archs.h" #include #include +#include """ tempstring += emit_prolog(); @@ -68,11 +69,15 @@ struct volk_machine *get_machine(void) { } } -static unsigned int get_index(const char **indices, char *arch_name) { - - - - +static unsigned int get_index(const char *indices[], unsigned int n_archs, const char *arch_name) { + int i; + for(i=0; i -generic 32|64 mmx sse sse2 +generic orc 32|64 mmx sse sse2 -generic 32|64 mmx sse sse2 sse3 +generic orc 32|64 mmx sse sse2 sse3 -generic 32|64 mmx sse sse2 sse3 ssse3 +generic orc 32|64 mmx sse sse2 sse3 ssse3 -generic 32|64 mmx sse sse2 sse3 sse4_a popcount +generic orc 32|64 mmx sse sse2 sse3 sse4_a popcount -generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 +generic orc 32|64 mmx sse sse2 sse3 ssse3 sse4_1 -generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount +generic orc 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount -generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount avx +generic orc 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount avx -generic altivec +generic orc altivec diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index ee9526860..591e8b64c 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -77,7 +77,8 @@ static unsigned int get_index(const char *indices[], unsigned int n_archs, const } } //something terrible should happen here - return 0; //but we'll fake it for now + printf("Volk warning: no arch found, returning generic impl\n"); + return get_index(indices, n_archs, "generic"); //but we'll fake it for now } """ diff --git a/volk/include/volk/make_each_machine_c.py b/volk/include/volk/make_each_machine_c.py index 2b0bf1f48..4efbe9e8c 100644 --- a/volk/include/volk/make_each_machine_c.py +++ b/volk/include/volk/make_each_machine_c.py @@ -36,12 +36,11 @@ def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist): #include """ + tempstring += emit_prolog(); for func in functions: tempstring += "#include \n" tempstring += "\n\n" - tempstring += emit_prolog(); - for i in range(len(functions)): tempstring += "static const " + replace_volk.sub("p", functions[i]) + " " + functions[i] + "_archs[] = {\n" diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index bd6283b58..eaff7f3c8 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -47,46 +47,52 @@ libvolk_la_SOURCES = \ volk_rank_archs.c \ volk_machines.cc + libvolk_la_LDFLAGS = +libvolk_la_LIBADD = -volk_orc_LDFLAGS = \ - $(ORC_LDFLAGS) \ - -lorc-0.4 +if LV_HAVE_ORC +volk_orc_CFLAGS = -DLV_HAVE_ORC=1 +volk_orc_LDFLAGS = $(ORC_LDFLAGS) -lorc-0.4 +volk_orc_LIBADD = ../orc/libvolk_orc.la +else +volk_orc_CFLAGS = +volk_orc_LDFLAGS = +volk_orc_LIBADD = +endif -volk_orc_LIBADD = \ - ../orc/libvolk_orc.la +libvolk_la_CPPFLAGS = $(AM_CPPFLAGS) $(volk_orc_CFLAGS) +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) +libvolk_la_LIBADD = $(volk_orc_LIBADD) + +noinst_LTLIBRARIES = """ #here be dragons - tempstring += "libvolk_la_LIBADD = \n" - tempstring += "libvolk_la_CPPFLAGS = $(AM_CPPFLAGS)\n" - tempstring += "noinst_LTLIBRARIES = \n" for machine_name in machines: tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" tempstring += "libvolk_" + machine_name + "_la_SOURCES = volk_machine_" + machine_name + ".cc\n" tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include $(volk_orc_CFLAGS) " + #tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include " for arch in machines[machine_name]: if archflags_dict[arch] != "none": tempstring += "-" + archflags_dict[arch] + " " tempstring += "\nnoinst_LTLIBRARIES += libvolk_" + machine_name + ".la " tempstring += "\nlibvolk_la_LIBADD += libvolk_" + machine_name + ".la\n" - tempstring += "libvolk_la_CPPFLAGS += -DLV_MACHINE_" + machine_name.swapcase() + " " - tempstring += "\nendif\n" + tempstring += "libvolk_la_CPPFLAGS += -DLV_MACHINE_" + machine_name.swapcase() + " \n" + #tempstring += "if LV_HAVE_ORC\n" + #tempstring += "libvolk_" + machine_name + "_la_LIBADD = $(volk_orc_LIBADD)\n" + #tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = $(volk_orc_LDFLAGS)\n" + #tempstring += "else\n" + #tempstring += "libvolk_" + machine_name + "_la_LIBADD = \n" + #tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = \n" + #tempstring += "endif\n" + tempstring += "endif\n" - tempstring += """ -if LV_HAVE_ORC -libvolk_la_LDFLAGS += $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) -libvolk_la_LIBADD += $(volk_orc_LIBADD) -volk_orc_CFLAGS = -DLV_HAVE_ORC -else -libvolk_la_LDFLAGS += $(NO_UNDEFINED) -version-info 0:0:0 -libvolk_la_LIBADD += -volk_orc_CFLAGS = -endif - + tempstring += r""" # ---------------------------------------------------------------- # The QA library. Note libvolk.la in LIBADD @@ -116,14 +122,8 @@ noinst_PROGRAMS = \ testqa_SOURCES = testqa.cc qa_utils.cc testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN $(AM_CPPFLAGS) testqa_LDFLAGS = $(BOOST_UNIT_TEST_FRAMEWORK_LIB) -if LV_HAVE_ORC -testqa_LDADD = \ - libvolk.la \ - ../orc/libvolk_orc.la -else testqa_LDADD = \ libvolk.la -endif distclean-local: rm -f volk.c -- cgit From cd0cc571d90fd2e0b8c9376ce07150db5f405fb7 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Sat, 16 Apr 2011 15:38:57 -0700 Subject: Volk: Compiles and runs with or without Orc installed. --- volk/include/volk/machines.xml | 16 ++++++++-------- volk/include/volk/make_each_machine_c.py | 10 +++++----- volk/include/volk/make_makefile_am.py | 7 ------- 3 files changed, 13 insertions(+), 20 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/machines.xml b/volk/include/volk/machines.xml index 9bf838527..ad71da177 100644 --- a/volk/include/volk/machines.xml +++ b/volk/include/volk/machines.xml @@ -15,35 +15,35 @@ --> -generic orc 32|64 mmx sse sse2 +generic 32|64 mmx sse sse2 -generic orc 32|64 mmx sse sse2 sse3 +generic 32|64 mmx sse sse2 sse3 -generic orc 32|64 mmx sse sse2 sse3 ssse3 +generic 32|64 mmx sse sse2 sse3 ssse3 -generic orc 32|64 mmx sse sse2 sse3 sse4_a popcount +generic 32|64 mmx sse sse2 sse3 sse4_a popcount -generic orc 32|64 mmx sse sse2 sse3 ssse3 sse4_1 +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 -generic orc 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount -generic orc 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount avx +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount avx -generic orc altivec +generic altivec diff --git a/volk/include/volk/make_each_machine_c.py b/volk/include/volk/make_each_machine_c.py index 4efbe9e8c..94d6d7789 100644 --- a/volk/include/volk/make_each_machine_c.py +++ b/volk/include/volk/make_each_machine_c.py @@ -46,12 +46,12 @@ def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist): tags_counter = 0 for arch_list in fcountlist[i]: - ok = True + tempstring += "#if " for arch in arch_list: - if arch.swapcase() not in archs: - ok = False - if ok: - tempstring += " " + functions[i] + "_" + str(taglist[i][tags_counter]) + ",\n" + tempstring += "defined(LV_HAVE_" + arch + ") && " + tempstring = strip_trailing(tempstring, " && ") + tempstring += "\n " + functions[i] + "_" + str(taglist[i][tags_counter]) + ",\n" + tempstring += "#endif\n" tags_counter += 1 tempstring = strip_trailing(tempstring, ",") diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index eaff7f3c8..c73730859 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -82,13 +82,6 @@ noinst_LTLIBRARIES = tempstring += "\nnoinst_LTLIBRARIES += libvolk_" + machine_name + ".la " tempstring += "\nlibvolk_la_LIBADD += libvolk_" + machine_name + ".la\n" tempstring += "libvolk_la_CPPFLAGS += -DLV_MACHINE_" + machine_name.swapcase() + " \n" - #tempstring += "if LV_HAVE_ORC\n" - #tempstring += "libvolk_" + machine_name + "_la_LIBADD = $(volk_orc_LIBADD)\n" - #tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = $(volk_orc_LDFLAGS)\n" - #tempstring += "else\n" - #tempstring += "libvolk_" + machine_name + "_la_LIBADD = \n" - #tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = \n" - #tempstring += "endif\n" tempstring += "endif\n" -- cgit From 513020d87912569691601cc2c49ca3331959fa63 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Sat, 16 Apr 2011 23:05:09 -0700 Subject: Volk: modified archs.xml to put Orc higher prio than old SSE/SSE2, since Orc gives better results than those platforms on avg Test cases changed to take ~1s each on my i7 laptop --- volk/include/volk/archs.xml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index 2547df590..977cc7924 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -5,12 +5,6 @@ none - - lorc-0.4 - LV_HAVE_ORC - no - - maltivec @@ -83,6 +77,12 @@ msse2 + + lorc-0.4 + LV_HAVE_ORC + no + + 1 1 -- cgit From a484de90ccf45ca7029497a66d089bd4c781c40f Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Sun, 17 Apr 2011 16:11:11 -0700 Subject: Volk: make_makefile_am.py changes for include dirs --- volk/include/volk/Makefile.am | 1 + volk/include/volk/make_makefile_am.py | 7 +------ 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 876bb7a60..9a665f59e 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -33,6 +33,7 @@ volkinclude_HEADERS = \ volk_registry.h \ volk.h \ volk_cpu.h \ + volk_machines.h \ volk_environment_init.h \ volk_16i_x5_add_quad_16i_x4_a16.h \ volk_16i_branch_4_state_8_a16.h \ diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index c73730859..1e43634f6 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -47,10 +47,6 @@ libvolk_la_SOURCES = \ volk_rank_archs.c \ volk_machines.cc - -libvolk_la_LDFLAGS = -libvolk_la_LIBADD = - if LV_HAVE_ORC volk_orc_CFLAGS = -DLV_HAVE_ORC=1 volk_orc_LDFLAGS = $(ORC_LDFLAGS) -lorc-0.4 @@ -73,8 +69,7 @@ noinst_LTLIBRARIES = for machine_name in machines: tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" tempstring += "libvolk_" + machine_name + "_la_SOURCES = volk_machine_" + machine_name + ".cc\n" - tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include $(volk_orc_CFLAGS) " - #tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include " + tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_srcdir)/include $(volk_orc_CFLAGS) " for arch in machines[machine_name]: if archflags_dict[arch] != "none": tempstring += "-" + archflags_dict[arch] + " " -- cgit From 208dcc510e6b6beedf2479a598a90d32c19e1274 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Sun, 17 Apr 2011 17:29:45 -0700 Subject: volk: updated lib and include .gitignores for in-tree build --- volk/include/volk/.gitignore | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/.gitignore b/volk/include/volk/.gitignore index be8358f3a..1afe439ba 100644 --- a/volk/include/volk/.gitignore +++ b/volk/include/volk/.gitignore @@ -9,12 +9,9 @@ /Makefile /Makefile.in /volk.h -/volk_config.h /volk_config_fixed.h /volk_cpu.h /volk_environment_init.h /volk_registry.h -/volk_runtime.h -/volk_tables.h /volk_typedefs.h -/volk_mktables +/volk_machines.h -- cgit From a56b291edf063604ea40391f40771eda99d8eb40 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Sun, 17 Apr 2011 17:39:15 -0700 Subject: volk: added attributes header (copied from gruel) --- volk/include/volk/Makefile.am | 1 + volk/include/volk/make_h.py | 1 + volk/include/volk/volk_attributes.h | 56 +++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) create mode 100644 volk/include/volk/volk_attributes.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 9a665f59e..a4440a980 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -26,6 +26,7 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ volkincludedir = $(prefix)/include/volk volkinclude_HEADERS = \ + volk_attributes.h \ volk_complex.h \ volk_common.h \ volk_config_fixed.h \ diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py index bf449719d..81928e6b5 100644 --- a/volk/include/volk/make_h.py +++ b/volk/include/volk/make_h.py @@ -12,6 +12,7 @@ def make_h(funclist, arched_arglist) : tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME'; tempstring = tempstring + '\n\n#include\n'; tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; tempstring = tempstring + emit_prolog(); diff --git a/volk/include/volk/volk_attributes.h b/volk/include/volk/volk_attributes.h new file mode 100644 index 000000000..5345c1535 --- /dev/null +++ b/volk/include/volk/volk_attributes.h @@ -0,0 +1,56 @@ +/* + * Copyright 2011 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_VOLK_ATTRIBUTES_H +#define INCLUDED_VOLK_ATTRIBUTES_H + +//////////////////////////////////////////////////////////////////////// +// Cross-platform attribute macros +//////////////////////////////////////////////////////////////////////// +#if defined __GNUC__ +# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) +# define __VOLK_ATTR_UNUSED __attribute__((unused)) +# define __VOLK_ATTR_INLINE __attribute__((always_inline)) +# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) +# if __GNUC__ >= 4 +# define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) +# define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) +# else +# define __VOLK_ATTR_EXPORT +# define __VOLK_ATTR_IMPORT +# endif +#elif _MSC_VER +# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) +# define __VOLK_ATTR_UNUSED +# define __VOLK_ATTR_INLINE __forceinline +# define __VOLK_ATTR_DEPRECATED __declspec(deprecated) +# define __VOLK_ATTR_EXPORT __declspec(dllexport) +# define __VOLK_ATTR_IMPORT __declspec(dllimport) +#else +# define __VOLK_ATTR_ALIGNED(x) +# define __VOLK_ATTR_UNUSED +# define __VOLK_ATTR_INLINE +# define __VOLK_ATTR_DEPRECATED +# define __VOLK_ATTR_EXPORT +# define __VOLK_ATTR_IMPORT +#endif + +#endif /* INCLUDED_VOLK_ATTRIBUTES_H */ -- cgit From 0b3e4f25eea1eef3e8a45fdb6d1bcc1ec57d1321 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Sun, 17 Apr 2011 23:03:18 -0700 Subject: volk: replace references to __attribute__((aligned... with cross platform macro This replaces all aligned(16) and aligned(128) (which were incorrect). Attribute macros are on the left because this makes gcc and ms compilers happy. --- volk/include/volk/volk_16ic_magnitude_16i_a16.h | 9 +++++---- volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h | 3 ++- volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h | 3 ++- volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h | 5 +++-- volk/include/volk/volk_32f_accumulator_s32f_a16.h | 3 ++- volk/include/volk/volk_32f_index_max_16u_a16.h | 9 +++++---- .../volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h | 5 +++-- volk/include/volk/volk_32f_s32f_convert_16i_a16.h | 3 ++- volk/include/volk/volk_32f_s32f_convert_16i_u.h | 2 +- volk/include/volk/volk_32f_s32f_convert_32i_a16.h | 3 ++- volk/include/volk/volk_32f_s32f_convert_32i_u.h | 2 +- volk/include/volk/volk_32f_s32f_convert_8i_a16.h | 3 ++- volk/include/volk/volk_32f_s32f_convert_8i_u.h | 2 +- volk/include/volk/volk_32f_s32f_stddev_32f_a16.h | 5 +++-- volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h | 9 +++++---- volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h | 7 ++++--- volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 6 +++--- volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h | 3 ++- volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h | 3 ++- volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h | 5 +++-- volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h | 5 +++-- volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 2 +- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 3 ++- volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h | 3 ++- volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h | 3 ++- 25 files changed, 63 insertions(+), 43 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a16.h b/volk/include/volk/volk_16ic_magnitude_16i_a16.h index e75d54ec4..00d29b112 100644 --- a/volk/include/volk/volk_16ic_magnitude_16i_a16.h +++ b/volk/include/volk/volk_16ic_magnitude_16i_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H #define INCLUDED_volk_16ic_magnitude_16i_a16_H +#include #include #include #include @@ -25,8 +26,8 @@ static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, co __m128 cplxValue1, cplxValue2, result; - float inputFloatBuffer[8] __attribute__((aligned(128))); - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ @@ -96,8 +97,8 @@ static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, con __m128 cplxValue1, cplxValue2, iValue, qValue, result; - float inputFloatBuffer[4] __attribute__((aligned(128))); - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h index dcb2499fa..a4f0689e5 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H +#include #include #include @@ -25,7 +26,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, fl __m128 invScalar = _mm_set_ps1(1.0/scalar); int16_t* complexVectorPtr = (int16_t*)complexVector; - float floatBuffer[8] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; for(;number < quarterPoints; number++){ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h index f21fe77f8..564aa1f5d 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H +#include #include #include @@ -72,7 +73,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, __m128 invScalar = _mm_set_ps1(iScalar); int16_t* complexVectorPtr = (int16_t*)complexVector; - float floatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; for(;number < quarterPoints; number++){ floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h index 388d2ebcd..637ba9fd0 100644 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H #define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H +#include #include #include #include @@ -25,7 +26,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, __m128 cplxValue1, cplxValue2, result; - float inputFloatBuffer[8] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; for(;number < quarterPoints; number++){ @@ -91,7 +92,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, __m128 cplxValue1, cplxValue2, result, re, im; - float inputFloatBuffer[8] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; for(;number < quarterPoints; number++){ inputFloatBuffer[0] = (float)(complexVectorPtr[0]); diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a16.h b/volk/include/volk/volk_32f_accumulator_s32f_a16.h index 6a85e066e..94aff3a49 100644 --- a/volk/include/volk/volk_32f_accumulator_s32f_a16.h +++ b/volk/include/volk/volk_32f_accumulator_s32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H #define INCLUDED_volk_32f_accumulator_s32f_a16_H +#include #include #include @@ -18,7 +19,7 @@ static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* const unsigned int quarterPoints = num_points / 4; const float* aPtr = inputBuffer; - float tempBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; __m128 accumulator = _mm_setzero_ps(); __m128 aVal = _mm_setzero_ps(); diff --git a/volk/include/volk/volk_32f_index_max_16u_a16.h b/volk/include/volk/volk_32f_index_max_16u_a16.h index 3934d2db7..5c19bfca0 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a16.h +++ b/volk/include/volk/volk_32f_index_max_16u_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_index_max_16u_a16_H #define INCLUDED_volk_32f_index_max_16u_a16_H +#include #include #include #include @@ -25,8 +26,8 @@ static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const __m128 compareResults; __m128 currentValues; - float maxValuesBuffer[4] __attribute__((aligned(16))); - float maxIndexesBuffer[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; for(;number < quarterPoints; number++){ @@ -83,8 +84,8 @@ static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const fl __m128 compareResults; __m128 currentValues; - float maxValuesBuffer[4] __attribute__((aligned(16))); - float maxIndexesBuffer[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; for(;number < quarterPoints; number++){ diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h index 55d4e0319..70ab3ccdb 100644 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H +#include #include #include @@ -21,7 +22,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* no const unsigned int quarterPoints = num_points / 4; const float* dataPointsPtr = realDataPoints; - float avgPointsVector[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; __m128 dataPointsVal; __m128 avgPointsVal = _mm_setzero_ps(); @@ -87,7 +88,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* no sumMean += avgPointsVector[3]; // Calculate the number of valid bins from the remaning count - float validBinCountVector[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4]; _mm_store_ps(validBinCountVector, vValidBinCount); float validBinCount = 0; diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h index 9d1d0ef4d..71b53ba3a 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H #define INCLUDED_volk_32f_s32f_convert_16i_a16_H +#include #include #include @@ -63,7 +64,7 @@ static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, cons __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ ret = _mm_load_ps(inputVectorPtr); diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h index 06228ef7d..dec3f1611 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -65,7 +65,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ ret = _mm_loadu_ps(inputVectorPtr); diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h index 82c74bf44..095d7bd35 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H #define INCLUDED_volk_32f_s32f_convert_32i_a16_H +#include #include #include @@ -59,7 +60,7 @@ static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, cons __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ ret = _mm_load_ps(inputVectorPtr); diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h index 253a48ae3..b4e954dc4 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -61,7 +61,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ ret = _mm_loadu_ps(inputVectorPtr); diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h index 8dab0cdf4..509a46609 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H #define INCLUDED_volk_32f_s32f_convert_8i_a16_H +#include #include #include @@ -70,7 +71,7 @@ static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ ret = _mm_load_ps(inputVectorPtr); diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h index 72b193c9d..1c6bf87c9 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h @@ -72,7 +72,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ ret = _mm_loadu_ps(inputVectorPtr); diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h index 48d2fe1fe..779ae2d39 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H #define INCLUDED_volk_32f_s32f_stddev_32f_a16_H +#include #include #include #include @@ -22,7 +23,7 @@ static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const floa const float* aPtr = inputBuffer; - float squareBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; __m128 squareAccumulator = _mm_setzero_ps(); __m128 aVal1, aVal2, aVal3, aVal4; @@ -82,7 +83,7 @@ static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* const float* aPtr = inputBuffer; - float squareBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; __m128 squareAccumulator = _mm_setzero_ps(); __m128 aVal = _mm_setzero_ps(); diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h index f1cb2ae0e..9605322d3 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H +#include #include #include #include @@ -22,8 +23,8 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, flo const unsigned int sixteenthPoints = num_points / 16; const float* aPtr = inputBuffer; - float meanBuffer[4] __attribute__((aligned(128))); - float squareBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; __m128 accumulator = _mm_setzero_ps(); __m128 squareAccumulator = _mm_setzero_ps(); @@ -95,8 +96,8 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* const unsigned int quarterPoints = num_points / 4; const float* aPtr = inputBuffer; - float meanBuffer[4] __attribute__((aligned(128))); - float squareBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; __m128 accumulator = _mm_setzero_ps(); __m128 squareAccumulator = _mm_setzero_ps(); diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h index d13f12e51..93151260f 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H #define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H +#include #include @@ -53,7 +54,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float bPtr += 4; } - float dotProductVector[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector @@ -102,7 +103,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float bPtr += 4; } - float dotProductVector[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector @@ -163,7 +164,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const flo dotProdVal = _mm_add_ps(dotProdVal, cVal1); } - float dotProductVector[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index 7c1136a67..7f47122ff 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -53,7 +53,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* bPtr += 4; } - float dotProductVector[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector @@ -102,7 +102,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * bPtr += 4; } - float dotProductVector[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector @@ -163,7 +163,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float dotProdVal = _mm_add_ps(dotProdVal, cVal1); } - float dotProductVector[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h index e98735245..cab3db50d 100644 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H +#include #include #include @@ -85,7 +86,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVe int16_t* complexVectorPtr = (int16_t*)complexVector; - float floatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; for(;number < quarterPoints; number++){ iValue = _mm_load_ps(iBufferPtr); diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h index 1e3e61e08..304515a5c 100644 --- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h +++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H +#include #include #include @@ -24,7 +25,7 @@ static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer __m128 cplxValue1, cplxValue2, iValue; - float floatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; for(;number < quarterPoints; number++){ cplxValue1 = _mm_load_ps(complexVectorPtr); diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h index 14318ab01..96afa5ae9 100644 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H #define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H +#include #include #include #include @@ -25,7 +26,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVecto __m128 cplxValue1, cplxValue2, result; - float floatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; for(;number < quarterPoints; number++){ cplxValue1 = _mm_load_ps(complexVectorPtr); @@ -80,7 +81,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector __m128 cplxValue1, cplxValue2, iValue, qValue, result; - float floatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; for(;number < quarterPoints; number++){ cplxValue1 = _mm_load_ps(complexVectorPtr); diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h index d78faf5b5..78e28c903 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H +#include #include #include @@ -64,7 +65,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* r static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; @@ -205,7 +206,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* resul #if LV_HAVE_SSE && LV_HAVE_32 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; int bound = num_bytes >> 4; int leftovers = num_bytes % 16; diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h index 69781f0fb..73576a766 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -66,7 +66,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* res static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; union HalfMask { uint32_t intRep[4]; diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h index b7b9768ab..d404ee684 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H +#include #include #include #include @@ -358,7 +359,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const b += 2; } - lv_32fc_t dotProductVector[2] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h index 80032d2fe..d68d2462a 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H +#include #include #include @@ -95,7 +96,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, flo __m128 invScalar = _mm_set_ps1(1.0/scalar); int8_t* complexVectorPtr = (int8_t*)complexVector; - float floatBuffer[8] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; for(;number < quarterPoints; number++){ floatBuffer[0] = (float)(complexVectorPtr[0]); diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h index 47a968ac1..d2cfa42f6 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H +#include #include #include @@ -81,7 +82,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, c __m128 invScalar = _mm_set_ps1(iScalar); int8_t* complexVectorPtr = (int8_t*)complexVector; - float floatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; for(;number < quarterPoints; number++){ floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; -- cgit From b31f891208f8a1c3605f4db2297cc6b6f4b37aca Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Sun, 17 Apr 2011 19:37:44 -0700 Subject: volk: added VOLK_API macro to external symbols --- volk/include/volk/make_h.py | 14 +++++++++++--- volk/include/volk/make_makefile_am.py | 2 ++ 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py index 81928e6b5..07e62939b 100644 --- a/volk/include/volk/make_h.py +++ b/volk/include/volk/make_h.py @@ -2,7 +2,14 @@ from xml.dom import minidom from emit_omnilog import * from volk_regexp import * - +# http://gcc.gnu.org/wiki/Visibility +volk_api_defines = """ +#ifdef volk_EXPORTS +# define VOLK_API __VOLK_ATTR_EXPORT +#else +# define VOLK_API __VOLK_ATTR_IMPORT +#endif +""" def make_h(funclist, arched_arglist) : tempstring = ""; @@ -14,15 +21,16 @@ def make_h(funclist, arched_arglist) : tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; + tempstring = tempstring + volk_api_defines tempstring = tempstring + emit_prolog(); tempstring = tempstring + '\n'; for i in range(len(funclist)): tempstring += "extern " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + ";\n" - tempstring += "extern void %s_manual%s;\n" % (funclist[i], arched_arglist[i]) + tempstring += "extern VOLK_API void %s_manual%s;\n" % (funclist[i], arched_arglist[i]) tempstring = strip_trailing(tempstring, " {") - tempstring += "extern struct volk_func_desc %s_get_func_desc(void);\n" % (funclist[i]) + tempstring += "extern VOLK_API struct volk_func_desc %s_get_func_desc(void);\n" % (funclist[i]) tempstring = tempstring + emit_epilog(); tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index 1e43634f6..af4467f37 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -26,6 +26,8 @@ include $(top_srcdir)/Makefile.common AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ -I$(top_builddir)/include \ + -Dvolk_EXPORTS \ + -fvisibility=hidden \ $(WITH_INCLUDES) lib_LTLIBRARIES = \ -- cgit From de771b0303298e7761f3c5350323565d05c5ee6d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Mon, 18 Apr 2011 10:11:57 -0700 Subject: Volk: remove some dead files from Makefile.am --- volk/include/volk/make_makefile_am.py | 1 - 1 file changed, 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index af4467f37..2ff492b34 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -35,7 +35,6 @@ lib_LTLIBRARIES = \ EXTRA_DIST = \ volk_rank_archs.h \ - volk_proccpu_sim.c \ gcc_x86_cpuid.h # ---------------------------------------------------------------- -- cgit From d941ba31677804fe382d76fca17fc044d12777f5 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Tue, 26 Apr 2011 21:41:34 -0700 Subject: volk: removed cppunit from the build (not used anymore) --- volk/include/volk/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index a4440a980..b36d96c5d 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -20,7 +20,7 @@ include $(top_srcdir)/Makefile.common -AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ $(LV_CXXFLAGS) $(WITH_INCLUDES) volkincludedir = $(prefix)/include/volk -- cgit From a5e2d9e5baf869ae961fbb5820447290d6d9c7c8 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Tue, 26 Apr 2011 21:55:48 -0700 Subject: volk: reorganization of generation sources and generated files All generation sources have been moved to the gen/ subdirectory. Bootstrap and volk_register.py generate the files into to gen/ subdirectory in an effort to cleanly separate the static/generated parts of the build tree. Define top_gendir in Makefile.common, all generated sources listed in Makefile.ams are prefixed with $(top_gendir) to differentiate them from static in-tree sources. --- volk/include/volk/.gitignore | 15 -- volk/include/volk/Makefile.am | 28 +-- volk/include/volk/archs.xml | 137 ------------- volk/include/volk/emit_omnilog.py | 13 -- volk/include/volk/machines.xml | 49 ----- volk/include/volk/make_c.py | 100 --------- volk/include/volk/make_config_fixed.py | 21 -- volk/include/volk/make_config_in.py | 13 -- volk/include/volk/make_cpuid_c.py | 184 ----------------- volk/include/volk/make_cpuid_h.py | 48 ----- volk/include/volk/make_each_machine_c.py | 82 -------- volk/include/volk/make_environment_init_c.py | 32 --- volk/include/volk/make_environment_init_h.py | 18 -- volk/include/volk/make_h.py | 39 ---- volk/include/volk/make_machines_c.py | 57 ------ volk/include/volk/make_machines_h.py | 50 ----- volk/include/volk/make_makefile_am.py | 131 ------------ volk/include/volk/make_proccpu_sim.py | 47 ----- volk/include/volk/make_registry.py | 85 -------- volk/include/volk/make_set_simd.py | 166 --------------- volk/include/volk/make_typedefs.py | 23 --- volk/include/volk/volk_regexp.py | 14 -- volk/include/volk/volk_register.py | 291 --------------------------- 23 files changed, 7 insertions(+), 1636 deletions(-) delete mode 100644 volk/include/volk/archs.xml delete mode 100644 volk/include/volk/emit_omnilog.py delete mode 100644 volk/include/volk/machines.xml delete mode 100644 volk/include/volk/make_c.py delete mode 100644 volk/include/volk/make_config_fixed.py delete mode 100644 volk/include/volk/make_config_in.py delete mode 100644 volk/include/volk/make_cpuid_c.py delete mode 100644 volk/include/volk/make_cpuid_h.py delete mode 100644 volk/include/volk/make_each_machine_c.py delete mode 100644 volk/include/volk/make_environment_init_c.py delete mode 100644 volk/include/volk/make_environment_init_h.py delete mode 100644 volk/include/volk/make_h.py delete mode 100644 volk/include/volk/make_machines_c.py delete mode 100644 volk/include/volk/make_machines_h.py delete mode 100644 volk/include/volk/make_makefile_am.py delete mode 100644 volk/include/volk/make_proccpu_sim.py delete mode 100644 volk/include/volk/make_registry.py delete mode 100644 volk/include/volk/make_set_simd.py delete mode 100644 volk/include/volk/make_typedefs.py delete mode 100644 volk/include/volk/volk_regexp.py delete mode 100755 volk/include/volk/volk_register.py (limited to 'volk/include') diff --git a/volk/include/volk/.gitignore b/volk/include/volk/.gitignore index 1afe439ba..b336cc7ce 100644 --- a/volk/include/volk/.gitignore +++ b/volk/include/volk/.gitignore @@ -1,17 +1,2 @@ -/*.cache -/*.la -/*.lo -/*.pc -/.deps -/.la -/.libs -/.lo /Makefile /Makefile.in -/volk.h -/volk_config_fixed.h -/volk_cpu.h -/volk_environment_init.h -/volk_registry.h -/volk_typedefs.h -/volk_machines.h diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index b36d96c5d..ea60d201a 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -29,13 +29,13 @@ volkinclude_HEADERS = \ volk_attributes.h \ volk_complex.h \ volk_common.h \ - volk_config_fixed.h \ - volk_typedefs.h \ - volk_registry.h \ - volk.h \ - volk_cpu.h \ - volk_machines.h \ - volk_environment_init.h \ + $(top_gendir)/include/volk/volk_config_fixed.h \ + $(top_gendir)/include/volk/volk_typedefs.h \ + $(top_gendir)/include/volk/volk_registry.h \ + $(top_gendir)/include/volk/volk.h \ + $(top_gendir)/include/volk/volk_cpu.h \ + $(top_gendir)/include/volk/volk_machines.h \ + $(top_gendir)/include/volk/volk_environment_init.h \ volk_16i_x5_add_quad_16i_x4_a16.h \ volk_16i_branch_4_state_8_a16.h \ volk_16ic_deinterleave_16i_x2_a16.h \ @@ -124,17 +124,3 @@ volkinclude_HEADERS = \ volk_8i_convert_16i_u.h \ volk_8i_s32f_convert_32f_a16.h \ volk_8i_s32f_convert_32f_u.h - -distclean-local: - rm -f volk_config_fixed.h - rm -f volk_cpu.h - rm -f volk.h - rm -f volk_registry.h - rm -f volk_runtime.h - rm -f volk_typedefs.h - rm -f volk_tables.h - rm -f *.pyc - rm -f Makefile.in - rm -f volk_environment_init.h - rm -f volk_mktables - rm -f $(BUILT_SOURCES) diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml deleted file mode 100644 index 977cc7924..000000000 --- a/volk/include/volk/archs.xml +++ /dev/null @@ -1,137 +0,0 @@ - - - - - none - - - - maltivec - - - - m32 - MD_SUBCPU - x86_64 - - - - 0x80000001 - d - 29 - m64 - 1 - MD_SUBCPU - x86 - - - - 0x80000001 - d - 31 - m3dnow - 1 - - - - 1 - 0x80000001 - d - 5 - sse4.2 - - - - 1 - 1 - c - 23 - mpopcnt - - - - 1 - 1 - d - 23 - mmmx - - - - - 1 - 1 - d - 25 - msse - _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); - xmmintrin.h - - - - - 1 - 1 - d - 26 - msse2 - - - - lorc-0.4 - LV_HAVE_ORC - no - - - - 1 - 1 - c - 0 - msse3 - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); - pmmintrin.h - - - - 1 - 1 - c - 9 - mssse3 - - - - 1 - 0x80000001 - c - 6 - msse4a - - - - - 1 - 1 - c - 19 - msse4.1 - - - - 1 - 1 - c - 20 - msse4.2 - - - - 1 - 1 - c - 28 - mavx - - - diff --git a/volk/include/volk/emit_omnilog.py b/volk/include/volk/emit_omnilog.py deleted file mode 100644 index 309d7e578..000000000 --- a/volk/include/volk/emit_omnilog.py +++ /dev/null @@ -1,13 +0,0 @@ -def emit_prolog(): - tempstring = ""; - tempstring = tempstring + '#ifdef __cplusplus\n'; - tempstring = tempstring + 'extern "C" {\n'; - tempstring = tempstring + '#endif\n'; - return tempstring; -def emit_epilog(): - tempstring = ""; - tempstring = tempstring + '#ifdef __cplusplus\n'; - tempstring = tempstring + '}\n'; - tempstring = tempstring + '#endif\n'; - return tempstring; - diff --git a/volk/include/volk/machines.xml b/volk/include/volk/machines.xml deleted file mode 100644 index ad71da177..000000000 --- a/volk/include/volk/machines.xml +++ /dev/null @@ -1,49 +0,0 @@ - - - -generic - - - - - -generic 32|64 mmx sse sse2 - - - -generic 32|64 mmx sse sse2 sse3 - - - -generic 32|64 mmx sse sse2 sse3 ssse3 - - - -generic 32|64 mmx sse sse2 sse3 sse4_a popcount - - - -generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 - - - -generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount - - - -generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount avx - - - -generic altivec - - - diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py deleted file mode 100644 index 591e8b64c..000000000 --- a/volk/include/volk/make_c.py +++ /dev/null @@ -1,100 +0,0 @@ -# -# Copyright 2010 Free Software Foundation, Inc. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -from volk_regexp import * -import string -from emit_omnilog import * - -#ok todo list: -#put n_archs into the info struct so it doesn't have to be arch_defs[0]. - -def make_c(machines, archs, functions, arched_arglist, my_arglist): - tempstring = r""" -// This file is automatically generated by make_c.py. -// Do not edit this file. -""" - tempstring += """ -#include -#include -#include -#include -#include -#include "volk_rank_archs.h" -#include -#include -#include - -""" - tempstring += emit_prolog(); - -#OK here's the deal. the .h prototypes the functions. the .c impls them as fptrs, can use p_whatever. -#also .c impls the get_machine call -#also .c impls the default call for each fn - -#here do static fn get arch - tempstring += r""" -struct volk_machine *get_machine(void) { - extern struct volk_machine volk_machines[]; - extern unsigned int n_volk_machines; - static struct volk_machine *machine = NULL; - - if(machine != NULL) return machine; - else { - unsigned int max_score = 0; - int i; - for(i=0; i max_score) { - max_score = volk_machines[i].caps; - machine = &(volk_machines[i]); - } - } - } - printf("Using Volk machine: %s\n", machine->name); - return machine; - } -} - -static unsigned int get_index(const char *indices[], unsigned int n_archs, const char *arch_name) { - int i; - for(i=0; i -#include - -struct VOLK_CPU volk_cpu; - -#if defined(__i386__) || (__x86_64__) -#include -#define cpuid_x86(op, r) __get_cpuid(op, r+0, r+1, r+2, r+3) - -static inline unsigned int cpuid_eax(unsigned int op) { - unsigned int regs[4]; - cpuid_x86 (op, regs); - return regs[0]; -} - -static inline unsigned int cpuid_ebx(unsigned int op) { - unsigned int regs[4]; - cpuid_x86 (op, regs); - return regs[1]; -} - -static inline unsigned int cpuid_ecx(unsigned int op) { - unsigned int regs[4]; - cpuid_x86 (op, regs); - return regs[2]; -} - -static inline unsigned int cpuid_edx(unsigned int op) { - unsigned int regs[4]; - cpuid_x86 (op, regs); - return regs[3]; -} -#endif - -""" - -def make_cpuid_c(dom) : - tempstring = HEADER_TEMPL; - - for domarch in dom: - if str(domarch.attributes["type"].value) == "x86": - if "no_test" in domarch.attributes.keys(): - no_test = str(domarch.attributes["no_test"].value); - if no_test == "true": - no_test = True; - else: - no_test = False; - else: - no_test = False; - arch = str(domarch.attributes["name"].value); - op = domarch.getElementsByTagName("op"); - if op: - op = str(op[0].firstChild.data); - reg = domarch.getElementsByTagName("reg"); - if reg: - reg = str(reg[0].firstChild.data); - shift = domarch.getElementsByTagName("shift"); - if shift: - shift = str(shift[0].firstChild.data); - val = domarch.getElementsByTagName("val"); - if val: - val = str(val[0].firstChild.data); - - if no_test: - tempstring = tempstring + """\ -int i_can_has_%s () { -#if defined(__i386__) || (__x86_64__) - return 1; -#else - return 0; -#endif -} - -""" % (arch) - - elif op == "1": - tempstring = tempstring + """\ -int i_can_has_%s () { -#if defined(__i386__) || (__x86_64__) - unsigned int e%sx = cpuid_e%sx (%s); - return ((e%sx >> %s) & 1) == %s; -#else - return 0; -#endif -} - -""" % (arch, reg, reg, op, reg, shift, val) - - elif op == "0x80000001": - tempstring = tempstring + """\ -int i_can_has_%s () { -#if defined(__i386__) || (__x86_64__) - unsigned int extended_fct_count = cpuid_eax(0x80000000); - if (extended_fct_count < 0x80000001) - return %s^1; - unsigned int extended_features = cpuid_e%sx (%s); - return ((extended_features >> %s) & 1) == %s; -#else - return 0; -#endif -} - -""" % (arch, val, reg, op, shift, val) - - elif str(domarch.attributes["type"].value) == "powerpc": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + """\ -int i_can_has_%s () { -#ifdef __PPC__ - return 1; -#else - return 0; -#endif -} - -""" % (arch) - - elif str(domarch.attributes["type"].value) == "all": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + """\ -int i_can_has_%s () { - return 1; -} - -""" % (arch) - else: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + """\ -int i_can_has_%s () { - return 0; -} - -""" % (arch) - - tempstring = tempstring + "void volk_cpu_init() {\n"; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" - tempstring = tempstring + "}\n\n" - - tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; - tempstring = tempstring + " unsigned int retval = 0;\n" - tempstring = tempstring + " volk_cpu_init();\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" - tempstring = tempstring + " return retval;\n" - tempstring = tempstring + "}\n\n" - - return tempstring; - - - - - - - diff --git a/volk/include/volk/make_cpuid_h.py b/volk/include/volk/make_cpuid_h.py deleted file mode 100644 index cd3da2455..000000000 --- a/volk/include/volk/make_cpuid_h.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2011 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Radio; see the file COPYING. If not, write to -# the Free Software Foundation, Inc., 51 Franklin Street, -# Boston, MA 02110-1301, USA. -# - -from xml.dom import minidom -from emit_omnilog import * - -def make_cpuid_h(dom) : - tempstring = ""; - tempstring = tempstring +'/*this file is auto generated by volk_register.py*/'; - tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_CPU_H'; - tempstring = tempstring +'\n#define INCLUDED_VOLK_CPU_H\n\n'; - tempstring = tempstring + emit_prolog(); - tempstring = tempstring + '\n' - - tempstring = tempstring + "struct VOLK_CPU {\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " int (*has_" + arch + ") ();\n"; - tempstring = tempstring + "};\n\n"; - tempstring = tempstring + "extern struct VOLK_CPU volk_cpu;\n\n"; - - tempstring = tempstring + "void volk_cpu_init ();\n" - tempstring = tempstring + "unsigned int volk_get_lvarch ();\n" - - tempstring = tempstring + "\n"; - tempstring = tempstring + emit_epilog(); - tempstring = tempstring + "#endif /*INCLUDED_VOLK_CPU_H*/\n" - - return tempstring; diff --git a/volk/include/volk/make_each_machine_c.py b/volk/include/volk/make_each_machine_c.py deleted file mode 100644 index 94d6d7789..000000000 --- a/volk/include/volk/make_each_machine_c.py +++ /dev/null @@ -1,82 +0,0 @@ -# -# Copyright 2010 Free Software Foundation, Inc. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -from volk_regexp import * -import string -from emit_omnilog import * - -#ok todo list: -#put n_archs into the info struct so it doesn't have to be arch_defs[0]. - -def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist): - tempstring = r""" -// This file is automatically generated by make_each_machine_c.py. -// Do not edit this file. -""" - for arch in archs: - tempstring += "#define LV_HAVE_" + arch.swapcase() + " 1\n" - - tempstring += """ -#include -#include -#include - -""" - tempstring += emit_prolog(); - for func in functions: - tempstring += "#include \n" - tempstring += "\n\n" - - for i in range(len(functions)): - tempstring += "static const " + replace_volk.sub("p", functions[i]) + " " + functions[i] + "_archs[] = {\n" - - tags_counter = 0 - for arch_list in fcountlist[i]: - tempstring += "#if " - for arch in arch_list: - tempstring += "defined(LV_HAVE_" + arch + ") && " - tempstring = strip_trailing(tempstring, " && ") - tempstring += "\n " + functions[i] + "_" + str(taglist[i][tags_counter]) + ",\n" - tempstring += "#endif\n" - tags_counter += 1 - - tempstring = strip_trailing(tempstring, ",") - tempstring += "};\n\n" - - - tempstring += "static unsigned int caps = " - for arch in archs: - tempstring += "(1 << LV_" + arch.swapcase() + ") + " - tempstring = strip_trailing(tempstring, " + ") - tempstring += ";\n" - - tempstring += "static const char* name = \"" + machine_name + "\";\n" - tempstring += "struct volk_machine volk_machine_" + machine_name + " = {\n" - tempstring += " caps,\n" - tempstring += " name,\n" - - for i in range(len(functions)): - tempstring += " { " + functions[i] + "_indices, " + functions[i] + "_arch_defs, " + functions[i] + "_n_archs },\n" - tempstring += " " + functions[i] + "_archs,\n" - - tempstring = strip_trailing(tempstring, ",") - tempstring += "};\n" - tempstring += emit_epilog(); - - return tempstring - - diff --git a/volk/include/volk/make_environment_init_c.py b/volk/include/volk/make_environment_init_c.py deleted file mode 100644 index 263d5bcd1..000000000 --- a/volk/include/volk/make_environment_init_c.py +++ /dev/null @@ -1,32 +0,0 @@ -from xml.dom import minidom - -def make_environment_init_c(dom) : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring + "#include\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - incs = domarch.getElementsByTagName("include"); - for inc in incs: - my_inc = str(inc.firstChild.data); - tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; - tempstring = tempstring + "#include<" + my_inc + ">\n"; - tempstring = tempstring + "#endif\n" - tempstring = tempstring + '\n\n'; - tempstring = tempstring + "void volk_environment_init(){\n" - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - envs = domarch.getElementsByTagName("environment"); - for env in envs: - cmd = str(env.firstChild.data); - tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; - tempstring = tempstring + " " + cmd + "\n"; - tempstring = tempstring + "#endif\n" - - tempstring = tempstring + "}\n"; - return tempstring; - - - - diff --git a/volk/include/volk/make_environment_init_h.py b/volk/include/volk/make_environment_init_h.py deleted file mode 100644 index 77a841a24..000000000 --- a/volk/include/volk/make_environment_init_h.py +++ /dev/null @@ -1,18 +0,0 @@ -from xml.dom import minidom -from emit_omnilog import * - -def make_environment_init_h() : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring + "#ifndef INCLUDE_LIBVECTOR_ENVIRONMENT_INIT_H\n"; - tempstring = tempstring + "#define INCLUDE_LIBVECTOR_ENVIRONMENT_INIT_H\n"; - tempstring = tempstring + "\n"; - tempstring = tempstring + emit_prolog(); - tempstring = tempstring + "void volk_environment_init();\n"; - tempstring = tempstring + emit_epilog(); - tempstring = tempstring + "#endif\n" - return tempstring; - - - - diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py deleted file mode 100644 index 07e62939b..000000000 --- a/volk/include/volk/make_h.py +++ /dev/null @@ -1,39 +0,0 @@ -from xml.dom import minidom -from emit_omnilog import * -from volk_regexp import * - -# http://gcc.gnu.org/wiki/Visibility -volk_api_defines = """ -#ifdef volk_EXPORTS -# define VOLK_API __VOLK_ATTR_EXPORT -#else -# define VOLK_API __VOLK_ATTR_IMPORT -#endif -""" - -def make_h(funclist, arched_arglist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by make_h.py*/\n'; - - tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_RUNTIME'; - tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME'; - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + volk_api_defines - tempstring = tempstring + emit_prolog(); - - tempstring = tempstring + '\n'; - - for i in range(len(funclist)): - tempstring += "extern " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + ";\n" - tempstring += "extern VOLK_API void %s_manual%s;\n" % (funclist[i], arched_arglist[i]) - tempstring = strip_trailing(tempstring, " {") - tempstring += "extern VOLK_API struct volk_func_desc %s_get_func_desc(void);\n" % (funclist[i]) - - tempstring = tempstring + emit_epilog(); - tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; - - return tempstring; - diff --git a/volk/include/volk/make_machines_c.py b/volk/include/volk/make_machines_c.py deleted file mode 100644 index 9ad56fb62..000000000 --- a/volk/include/volk/make_machines_c.py +++ /dev/null @@ -1,57 +0,0 @@ -# -# Copyright 2010 Free Software Foundation, Inc. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -from volk_regexp import * - -def make_machines_c(machines): - tempstring = r""" -// This file is automatically generated by make_machines_c.py. -// Do not edit this file. - -#include -#include -#include - -struct volk_machine volk_machines[] = { -""" - for machine in machines: - tempstring += """#if LV_MACHINE_""" + machine.swapcase() + "\n" - tempstring += "volk_machine_" + machine - tempstring += "," - tempstring += "\n#endif\n" - - tempstring += r""" -}; - -""" - - for machine in machines: - tempstring += "#if LV_MACHINE_" + machine.swapcase() + "\n" - tempstring += "#define LV_MACHINE_" + machine.swapcase() + "_CNT 1\n" - tempstring += "#else\n" - tempstring += "#define LV_MACHINE_" + machine.swapcase() + "_CNT 0\n" - tempstring += "#endif\n" - - tempstring += """unsigned int n_volk_machines = -""" - for machine in machines: - tempstring += "(LV_MACHINE_" + machine.swapcase() + "_CNT) " - tempstring += "+ " - tempstring = tempstring[:-2] - tempstring += ";\n" - - return tempstring diff --git a/volk/include/volk/make_machines_h.py b/volk/include/volk/make_machines_h.py deleted file mode 100644 index 674ee12cd..000000000 --- a/volk/include/volk/make_machines_h.py +++ /dev/null @@ -1,50 +0,0 @@ -# -# Copyright 2010 Free Software Foundation, Inc. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -from volk_regexp import * - -def make_machines_h(functions, machines): - tempstring = r""" -// This file is automatically generated by make_machines_h.py. -// Do not edit this file. - -#ifndef INCLUDED_LIBVOLK_MACHINES_H -#define INCLUDED_LIBVOLK_MACHINES_H - -#include -#include - -struct volk_machine { - const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format) - const char *name; -""" - for function in functions: - tempstring += "\n const struct volk_func_desc " + function + "_desc;\n" - tempstring += " const " + replace_volk.sub("p", function) + " *" + function + "_archs;\n" - - tempstring += r"""}; - -""" - for machine in machines: - tempstring += """#if LV_MACHINE_""" + machine.swapcase() + "\n" - tempstring += "extern const struct volk_machine volk_machine_" + machine + ";\n" - tempstring += """#endif\n""" - - tempstring += r""" -#endif //INCLUDED_LIBVOLK_MACHINES_H""" - - return tempstring diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py deleted file mode 100644 index 2ff492b34..000000000 --- a/volk/include/volk/make_makefile_am.py +++ /dev/null @@ -1,131 +0,0 @@ -# -# Copyright 2010 Free Software Foundation, Inc. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -from xml.dom import minidom - -def make_makefile_am(dom, machines, archflags_dict): - tempstring = r""" -# This file is automatically generated by make_makefile_am.py. -# Do not edit this file. - -include $(top_srcdir)/Makefile.common - -AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ - -I$(top_builddir)/include \ - -Dvolk_EXPORTS \ - -fvisibility=hidden \ - $(WITH_INCLUDES) - -lib_LTLIBRARIES = \ - libvolk.la - -EXTRA_DIST = \ - volk_rank_archs.h \ - gcc_x86_cpuid.h - -# ---------------------------------------------------------------- -# The main library -# ---------------------------------------------------------------- - -libvolk_la_SOURCES = \ - $(platform_CODE) \ - volk.cc \ - volk_cpu.c \ - volk_rank_archs.c \ - volk_machines.cc - -if LV_HAVE_ORC -volk_orc_CFLAGS = -DLV_HAVE_ORC=1 -volk_orc_LDFLAGS = $(ORC_LDFLAGS) -lorc-0.4 -volk_orc_LIBADD = ../orc/libvolk_orc.la -else -volk_orc_CFLAGS = -volk_orc_LDFLAGS = -volk_orc_LIBADD = -endif - -libvolk_la_CPPFLAGS = $(AM_CPPFLAGS) $(volk_orc_CFLAGS) -libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) -libvolk_la_LIBADD = $(volk_orc_LIBADD) - -noinst_LTLIBRARIES = - -""" - - #here be dragons - for machine_name in machines: - tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" - tempstring += "libvolk_" + machine_name + "_la_SOURCES = volk_machine_" + machine_name + ".cc\n" - tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_srcdir)/include $(volk_orc_CFLAGS) " - for arch in machines[machine_name]: - if archflags_dict[arch] != "none": - tempstring += "-" + archflags_dict[arch] + " " - - tempstring += "\nnoinst_LTLIBRARIES += libvolk_" + machine_name + ".la " - tempstring += "\nlibvolk_la_LIBADD += libvolk_" + machine_name + ".la\n" - tempstring += "libvolk_la_CPPFLAGS += -DLV_MACHINE_" + machine_name.swapcase() + " \n" - tempstring += "endif\n" - - - tempstring += r""" - -# ---------------------------------------------------------------- -# The QA library. Note libvolk.la in LIBADD -# ---------------------------------------------------------------- -#libvolk_qa_la_SOURCES = \ -# qa_utils.cc - -#libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lboost - -#libvolk_qa_la_LIBADD = \ -# libvolk.la \ -# libvolk_runtime.la - -# ---------------------------------------------------------------- -# headers that don't get installed -# ---------------------------------------------------------------- -noinst_HEADERS = \ - volk_init.h \ - qa_utils.h - -# ---------------------------------------------------------------- -# Our test program -# ---------------------------------------------------------------- -noinst_PROGRAMS = \ - testqa - -testqa_SOURCES = testqa.cc qa_utils.cc -testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN $(AM_CPPFLAGS) -testqa_LDFLAGS = $(BOOST_UNIT_TEST_FRAMEWORK_LIB) -testqa_LDADD = \ - libvolk.la - -distclean-local: - rm -f volk.c - rm -f volk_cpu_generic.c - rm -f volk_cpu_powerpc.c - rm -f volk_cpu_x86.c - rm -f volk_init.c - rm -f volk_init.h - rm -f volk_mktables.c - rm -f volk_proccpu_sim.c - rm -f volk_tables.h - rm -f volk_environment_init.c -""" - - - return tempstring diff --git a/volk/include/volk/make_proccpu_sim.py b/volk/include/volk/make_proccpu_sim.py deleted file mode 100644 index 029dacfcc..000000000 --- a/volk/include/volk/make_proccpu_sim.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2011 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Radio; see the file COPYING. If not, write to -# the Free Software Foundation, Inc., 51 Franklin Street, -# Boston, MA 02110-1301, USA. -# - -from xml.dom import minidom - -def make_proccpu_sim(dom) : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n" - tempstring = tempstring + "\n\n" - - tempstring = tempstring + "void test_append(char* buf, int val, char* newkey){\n"; - tempstring = tempstring + " if(val==1){\n"; - tempstring = tempstring + " sprintf(buf, \"%s %s\", buf, newkey);\n"; - tempstring = tempstring + " }\n"; - tempstring = tempstring + "}\n"; - tempstring = tempstring + "\n\n"; - - tempstring = tempstring + "int main() {\n"; - tempstring = tempstring + " volk_cpu_init();\n"; - tempstring = tempstring + " char buf[2048];\n"; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " test_append(buf, volk_cpu.has_" + arch + "(), \"" + arch + "\");\n" - tempstring = tempstring + " printf(\"%s\\n\", buf);\n" - tempstring = tempstring + "}\n" - return tempstring; diff --git a/volk/include/volk/make_registry.py b/volk/include/volk/make_registry.py deleted file mode 100644 index de1f46aa6..000000000 --- a/volk/include/volk/make_registry.py +++ /dev/null @@ -1,85 +0,0 @@ -from xml.dom import minidom -from emit_omnilog import * -from volk_regexp import * -import string - -def make_registry(dom, funclist, fcountlist, taglist) : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_REGISTRY_H'; - tempstring = tempstring +'\n#define INCLUDED_VOLK_REGISTRY_H\n\n'; - tempstring = tempstring +'#include\n'; - tempstring = tempstring + emit_prolog(); - tempstring = tempstring + '\n' - - - - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring +"#ifdef LV_HAVE_" + arch.swapcase() + "\n"; - tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 1\n"; - tempstring = tempstring +"#else\n"; - tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 0\n"; - tempstring = tempstring +"#endif /*LV_HAVE_" + arch.swapcase() + "*/\n\n"; - - counter = 0; - - for i in range(len(funclist)): - tempstring = tempstring + "static const char* " + funclist[i] + "_indices[] = {\n"; - - tags_counter = 0; - for arch_list in fcountlist[i]: - tempstring = tempstring + "#if defined(LV_HAVE_" - for ind in range(len(arch_list)): - - tempstring = tempstring + arch_list[ind] + ")"; - if ind < len(arch_list) - 1: - tempstring = tempstring + " && defined(LV_HAVE_"; - - tempstring = tempstring + "\n \"" + str(taglist[i][tags_counter]) + "\",\n#endif\n"; - tags_counter = tags_counter + 1; - - tempstring = strip_trailing(tempstring, ",") - tempstring = tempstring + "};\n\n"; - - - for fcount in fcountlist: - tempstring = tempstring + "static const int " + funclist[counter] + "_arch_defs[] = {\n"; - counter += 1; - for arch_list in fcount: - tempstring = tempstring + "#if defined(LV_HAVE_" - for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind] + ")"; - if ind < len(arch_list) - 1: - tempstring = tempstring + " && defined(LV_HAVE_"; - tempstring = tempstring + "\n" - tempstring = tempstring + " (1 << LV_" - for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind]; - if ind < len(arch_list) - 1: - tempstring = tempstring + ") + (1 << LV_" - tempstring = tempstring + "),\n#endif\n" - tempstring = strip_trailing(tempstring, ",") - tempstring = tempstring + "};\n\n" - - counter = 0; - for fcount in fcountlist: - tempstring += "static const int " + funclist[counter] + "_n_archs = " - counter += 1; - for arch_list in fcount: - tempstring = tempstring + " (LV_" - for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind] + "_CNT"; - if ind < len(arch_list) - 1: - tempstring = tempstring + " * LV_"; - tempstring = tempstring + ") + "; - tempstring = strip_trailing(tempstring, " + "); - tempstring = tempstring + ";\n" - - - tempstring = tempstring + emit_epilog(); - tempstring = tempstring +"#endif /*INCLUDED_VOLK_REGISTRY_H*/\n"; - - return tempstring; - diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py deleted file mode 100644 index 5a848e59e..000000000 --- a/volk/include/volk/make_set_simd.py +++ /dev/null @@ -1,166 +0,0 @@ -# -# Copyright 2010 Free Software Foundation, Inc. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -from xml.dom import minidom - -def make_set_simd(dom, machines) : - tempstring = ""; - tempstring = tempstring +'dnl this file is auto generated by volk_register.py\n\n'; - - tempstring = tempstring +'\ndnl define arch checks\n'; - for domarch in dom: - if str(domarch.attributes["type"].value) != "all": - arch = str(domarch.attributes["name"].value); - flag = domarch.getElementsByTagName("flag"); - flag = str(flag[0].firstChild.data); - tempstring = tempstring + "AC_DEFUN([_TRY_ADD_" + arch.swapcase() + "],\n"; - tempstring = tempstring + "[\n"; - tempstring = tempstring + " LF_CHECK_CC_FLAG([-" + flag + "])\n"; - tempstring = tempstring + " LF_CHECK_CXX_FLAG([-" + flag + "])\n"; - tempstring = tempstring + "])\n"; - - tempstring = tempstring +'\ndnl main set_simd_flags\n'; - tempstring = tempstring + "AC_DEFUN([LV_SET_SIMD_FLAGS],\n"; - tempstring = tempstring + "[\n"; - #tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n"; - tempstring = tempstring + " AC_SUBST(LV_CXXFLAGS)\n"; - tempstring = tempstring + " indCC=no\n"; - tempstring = tempstring + " indCXX=no\n"; - tempstring = tempstring + " indLV_ARCH=no\n"; - tempstring = tempstring + " AC_ARG_WITH(lv_arch,\n"; - tempstring = tempstring + " AC_HELP_STRING([--with-lv_arch=ARCH],[set volk hardware speedups as space separated string with elements from the following list("; - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + arch + ", " - tempstring = tempstring[0:len(tempstring) - 2]; - - tempstring = tempstring + ")]),\n"; - tempstring = tempstring + " [cf_with_lv_arch=\"$withval\"],\n"; - tempstring = tempstring + " [cf_with_lv_arch=\"\"])\n"; - if str(domarch.attributes["type"].value) == "all": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " AC_DEFINE(LV_MAKE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; - tempstring = tempstring + " ADDONS=\"\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"\"\n"; - #tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; - tempstring = tempstring + " OVERRULE_FLAG=\"no\"\n"; - tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; - tempstring = tempstring + " cf_with_lv_arch=\""; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + arch + " "; - tempstring = tempstring[0:-1] + "\"\n"; - tempstring = tempstring + " OVERRULE_FLAG=\"yes\"\n"; - tempstring = tempstring + " fi\n"; - - tempstring = tempstring +'\ndnl init LV_MAKE_XXX and then try to add archs\n'; - for domarch in dom: - if str(domarch.attributes["type"].value) != "all": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=no\n"; - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - atype = str(domarch.attributes["type"].value); - if atype != "all": - tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n"; - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - atype = str(domarch.attributes["type"].value); - tempstring = tempstring +'\ndnl add in flags for arch ' + arch + '\n'; - overrule = domarch.getElementsByTagName("overrule"); - if overrule: - overrule = str(overrule[0].firstChild.data); - else: - overrule = ""; - overrule_val = domarch.getElementsByTagName("overrule_val"); - if overrule_val: - overrule_val = str(overrule_val[0].firstChild.data); - else: - overrule_val = ""; - flag = domarch.getElementsByTagName("flag"); - flag = str(flag[0].firstChild.data); - if atype != "all": - tempstring = tempstring + " for i in $lf_CXXFLAGS\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; - tempstring = tempstring + " indCXX=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " for i in $lf_CFLAGS\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; - tempstring = tempstring + " indCC=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " indLV_ARCH=no\n" - tempstring = tempstring + " fi\n" - - tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - - #tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; - tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " indCC=no\n" - tempstring = tempstring + " indCXX=no\n" - tempstring = tempstring + " indLV_ARCH=no\n" - else: - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " indLV_ARCH=no\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " indLV_ARCH=no\n" - - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " AM_CONDITIONAL(LV_MAKE_" + arch.swapcase() + ", test \"$LV_MAKE_" + arch.swapcase() + "\" == \"yes\")\n"; - - tempstring += "\n" - #now we can define the machines we're compiling - for machine_name in machines: - tempstring += " AM_CONDITIONAL(LV_MACHINE_" + machine_name.swapcase() + ", " - marchlist = machines[machine_name] - for march in marchlist: - tempstring += "test \"$LV_MAKE_" + march.swapcase() + "\" == \"yes\" && " - - tempstring += "test true)\n" #just so we don't have to detect the last one in the group, i know - tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" - tempstring = tempstring + "])\n" - - return tempstring; - - diff --git a/volk/include/volk/make_typedefs.py b/volk/include/volk/make_typedefs.py deleted file mode 100644 index 8f9f2b55e..000000000 --- a/volk/include/volk/make_typedefs.py +++ /dev/null @@ -1,23 +0,0 @@ -from xml.dom import minidom -import string -from volk_regexp import * - - - -def make_typedefs(funclist, retlist, my_argtypelist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_TYPEDEFS'; - tempstring = tempstring + '\n#define INCLUDED_VOLK_TYPEDEFS\n'; - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; - - tempstring = tempstring + '\n'; - - for i in range(len(funclist)): - tempstring = tempstring + "typedef " + retlist[i] +" (*" + replace_volk.sub("p", funclist[i]) + ")(" + my_argtypelist[i] + ");\n"; - - tempstring = tempstring + "#endif /*INCLUDED_VOLK_TYPEDEFS*/\n"; - - return tempstring; diff --git a/volk/include/volk/volk_regexp.py b/volk/include/volk/volk_regexp.py deleted file mode 100644 index b83ce5206..000000000 --- a/volk/include/volk/volk_regexp.py +++ /dev/null @@ -1,14 +0,0 @@ -import re -import string - -remove_after_underscore = re.compile("_.*"); -space_remove = re.compile(" "); -leading_space_remove = re.compile("^ *"); -replace_arch = re.compile(", const char\* arch"); -replace_bracket = re.compile(" {"); -replace_volk = re.compile("volk"); - -def strip_trailing(tostrip, stripstr): - lindex = tostrip.rfind(stripstr) - tostrip = tostrip[0:lindex] + string.replace(tostrip[lindex:len(tostrip)], stripstr, ""); - return tostrip diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py deleted file mode 100755 index 9b7ca73f9..000000000 --- a/volk/include/volk/volk_register.py +++ /dev/null @@ -1,291 +0,0 @@ -#! /usr/bin/env python - -import sys -import re -import string -from xml.dom import minidom -from volk_regexp import * -from make_cpuid_c import make_cpuid_c -from make_cpuid_h import make_cpuid_h -from make_set_simd import make_set_simd -from make_registry import make_registry -from make_config_fixed import make_config_fixed -from make_typedefs import make_typedefs -from make_environment_init_c import make_environment_init_c -from make_environment_init_h import make_environment_init_h -from make_makefile_am import make_makefile_am -from make_machines_h import make_machines_h -from make_machines_c import make_machines_c -from make_each_machine_c import make_each_machine_c -from make_c import make_c -from make_h import make_h -import copy - -outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); -outfile_reg = open("volk_registry.h", "w"); -outfile_h = open("volk.h", "w"); -outfile_c = open("../../lib/volk.cc", "w"); -outfile_typedefs = open("volk_typedefs.h", "w"); -outfile_init_h = open("../../lib/volk_init.h", "w"); -outfile_cpu_h = open("volk_cpu.h", "w"); -outfile_cpu_c = open("../../lib/volk_cpu.c", "w"); -#outfile_config_in = open("../../volk_config.h.in", "w"); -outfile_config_fixed = open("volk_config_fixed.h", "w"); -#outfile_mktables = open("../../lib/volk_mktables.c", "w"); -outfile_environment_c = open("../../lib/volk_environment_init.c", "w"); -outfile_environment_h = open("volk_environment_init.h", "w"); -outfile_makefile_am = open("../../lib/Makefile.am", "w"); -outfile_machines_h = open("volk_machines.h", "w"); -outfile_machines_c = open("../../lib/volk_machines.cc", "w"); -infile = open("Makefile.am", "r"); - - -mfile = infile.readlines(); - -datatypes = []; -functions = []; - - - -for line in mfile: - subline = re.search(".*_(a16|u)\.h.*", line); - if subline: - subsubline = re.search("(?<=volk_).*", subline.group(0)); - if subsubline: - dtype = remove_after_underscore.sub("", subsubline.group(0)); - subdtype = re.search("[0-9]+[A-z]+", dtype); - if subdtype: - datatypes.append(subdtype.group(0)); - - -datatypes = set(datatypes); - -for line in mfile: - for dt in datatypes: - if dt in line: - subline = re.search("(volk_" + dt +"_.*(a16|u).*\.h)", line); - if subline: - - subsubline = re.search(".+(?=\.h)", subline.group(0)); - functions.append(subsubline.group(0)); - -archs = []; -afile = minidom.parse("archs.xml"); -filearchs = afile.getElementsByTagName("arch"); -for filearch in filearchs: - archs.append(str(filearch.attributes["name"].value)); - -for arch in archs: - a_var = re.search("^\$", arch); - if a_var: - archs.remove(arch); - - - -archflags_dict = {} -for filearch in filearchs: - archflags_dict[str(filearch.attributes["name"].value)] = str(filearch.getElementsByTagName("flag")[0].firstChild.data) - -archs_or = "(" -for arch in archs: - archs_or = archs_or + string.upper(arch) + "|"; -archs_or = archs_or[0:len(archs_or)-1]; -archs_or = archs_or + ")"; - -#get machine list and parse to a list of machines, each with a list of archs (none of this DOM crap) -machine_str_dict = {} -mfile = minidom.parse("machines.xml"); -filemachines = mfile.getElementsByTagName("machine") - -for filemachine in filemachines: - machine_str_dict[str(filemachine.attributes["name"].value)] = str(filemachine.getElementsByTagName("archs")[0].firstChild.data).split() - -#all right now you have a dict of arch lists -#next we expand it -#this is an expanded list accounting for the OR syntax -#TODO: make this work for multiple "|" machines -machines = {} -already_done = False -for machine_name in machine_str_dict: - already_done = False - marchlist = machine_str_dict[machine_name] - for march in marchlist: - or_marchs = march.split("|") - if len(or_marchs) > 1: - marchlist.remove(march) - for or_march in or_marchs: - tempmarchlist = copy.deepcopy(marchlist) - tempmarchlist.append(or_march) - machines[machine_name + "_" + or_march] = tempmarchlist - already_done = True - - if not already_done: - machines[machine_name] = marchlist - -#for machine_name in machines: -# print machine_name + ": " + str(machines[machine_name]) - -#ok, now we have all the machines we're going to build. next step is to generate a Makefile.am where they're all laid out and compiled - -taglist = []; -fcountlist = []; -arched_arglist = []; -retlist = []; -my_arglist = []; -my_argtypelist = []; -for func in functions: - tags = []; - fcount = []; - infile_source = open(func + ".h"); - begun_name = 0; - begun_paren = 0; - sourcefile = infile_source.readlines(); - infile_source.close(); - for line in sourcefile: -#FIXME: make it work for multiple #if define()s - archline = re.search("^\#if.*?LV_HAVE_" + archs_or + ".*", line); - if archline: - arch = archline.group(0); - archline = re.findall(archs_or + "(?=( |\n|&))", line); - if archline: - archsublist = []; - for tup in archline: - archsublist.append(tup[0]); - fcount.append(archsublist); - testline = re.search("static inline.*?" + func, line); - if (not testline): - continue - tagline = re.search(func + "_.+", line); - if tagline: - tag = re.search("(?<=" + func + "_)\w+(?= *\()",line); - if tag: - tag = re.search("\w+", tag.group(0)); - if tag: - tags.append(tag.group(0)); - - - if begun_name == 0: - retline = re.search(".+(?=" + func + ")", line); - if retline: - ret = retline.group(0); - - - - - subline = re.search(func + ".*", line); - if subline: - subsubline = re.search("\(.*?\)", subline.group(0)); - if subsubline: - args = subsubline.group(0); - - else: - begun_name = 1; - subsubline = re.search("\(.*", subline.group(0)); - if subsubline: - args = subsubline.group(0); - begun_paren = 1; - else: - if begun_paren == 1: - subline = re.search(".*?\)", line); - if subline: - args = args + subline.group(0); - begun_name = 0; - begun_paren = 0; - else: - subline = re.search(".*", line); - args = args + subline.group(0); - else: - subline = re.search("\(.*?\)", line); - if subline: - args = subline.group(0); - begun_name = 0; - else: - subline = re.search("\(.*", line); - if subline: - args = subline.group(0); - begun_paren = 1; - - replace = re.compile("static "); - ret = replace.sub("", ret); - replace = re.compile("inline "); - ret = replace.sub("", ret); - replace = re.compile("\)"); - arched_args = replace.sub(", const char* arch) {", args); - - remove = re.compile('\)|\(|{'); - rargs = remove.sub("", args); - sargs = rargs.split(','); - - - - margs = []; - atypes = []; - for arg in sargs: - temp = arg.split(" "); - margs.append(temp[-1]); - replace = re.compile(" " + temp[-1]); - atypes.append(replace.sub("", arg)); - - - my_args = "" - arg_types = "" - for arg in range(0, len(margs) - 1): - this_arg = leading_space_remove.sub("", margs[arg]); - my_args = my_args + this_arg + ", "; - this_type = leading_space_remove.sub("", atypes[arg]); - arg_types = arg_types + this_type + ", "; - - this_arg = leading_space_remove.sub("", margs[-1]); - my_args = my_args + this_arg; - this_type = leading_space_remove.sub("", atypes[-1]); - arg_types = arg_types + this_type; - my_argtypelist.append(arg_types); - - if(ret[-1] != ' '): - ret = ret + ' '; - - arched_arglist.append(arched_args); #!!!!!!!!!!! - my_arglist.append(my_args) #!!!!!!!!!!!!!!!!! - retlist.append(ret); - fcountlist.append(fcount); - taglist.append(tags); - - -outfile_cpu_h.write(make_cpuid_h(filearchs)); -outfile_cpu_h.close(); - -outfile_cpu_c.write(make_cpuid_c(filearchs)); -outfile_cpu_c.close(); - -outfile_set_simd.write(make_set_simd(filearchs, machines)); -outfile_set_simd.close(); - -outfile_reg.write(make_registry(filearchs, functions, fcountlist, taglist)); -outfile_reg.close(); - -outfile_config_fixed.write(make_config_fixed(filearchs)); -outfile_config_fixed.close(); - -outfile_typedefs.write(make_typedefs(functions, retlist, my_argtypelist)); -outfile_typedefs.close(); - -outfile_makefile_am.write(make_makefile_am(filearchs, machines, archflags_dict)) -outfile_makefile_am.close() - -outfile_machines_h.write(make_machines_h(functions, machines)) -outfile_machines_h.close() - -outfile_machines_c.write(make_machines_c(machines)) -outfile_machines_c.close() - -outfile_c.write(make_c(machines, archs, functions, arched_arglist, my_arglist)) -outfile_c.close() - -outfile_h.write(make_h(functions, arched_arglist)) -outfile_h.close() - -for machine in machines: - machine_c_filename = "../../lib/volk_machine_" + machine + ".cc" - outfile_machine_c = open(machine_c_filename, "w") - outfile_machine_c.write(make_each_machine_c(machine, machines[machine], functions, fcountlist, taglist)) - outfile_machine_c.close() -- cgit From 12413747c90754482582e16c95b551e1b36c6074 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Fri, 6 May 2011 11:25:00 -0700 Subject: volk: removed volk_registry.h, it was superseded by the machines --- volk/include/volk/Makefile.am | 1 - 1 file changed, 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index ea60d201a..bb4645526 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -31,7 +31,6 @@ volkinclude_HEADERS = \ volk_common.h \ $(top_gendir)/include/volk/volk_config_fixed.h \ $(top_gendir)/include/volk/volk_typedefs.h \ - $(top_gendir)/include/volk/volk_registry.h \ $(top_gendir)/include/volk/volk.h \ $(top_gendir)/include/volk/volk_cpu.h \ $(top_gendir)/include/volk/volk_machines.h \ -- cgit From 5b4c7d27e9d49ab58df1f1d9350dcaf64c60a1ce Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Fri, 6 May 2011 12:41:16 -0700 Subject: volk: top-level common header cleanup Since we already have a volk_common.h, moved the attributes, API declaration, and c-linkage macros into volk_common.h This change removes volk_attributes.h, in favor of one common include header. The implementation headers that require attributes now include volk_common.h This change removes the emit_omnilog.py script, in favor of pre-processor macros in volk_common.h In addition, extern C is only defined when in C++ and in GCC because non-GCC does not have complex.h --- volk/include/volk/Makefile.am | 1 - volk/include/volk/volk_16ic_magnitude_16i_a16.h | 2 +- .../volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h | 2 +- .../volk_16ic_s32f_deinterleave_real_32f_a16.h | 2 +- .../volk/volk_16ic_s32f_magnitude_32f_a16.h | 2 +- volk/include/volk/volk_32f_accumulator_s32f_a16.h | 2 +- volk/include/volk/volk_32f_index_max_16u_a16.h | 2 +- ...lk_32f_s32f_calc_spectral_noise_floor_32f_a16.h | 2 +- volk/include/volk/volk_32f_s32f_convert_16i_a16.h | 2 +- volk/include/volk/volk_32f_s32f_convert_32i_a16.h | 2 +- volk/include/volk/volk_32f_s32f_convert_8i_a16.h | 2 +- volk/include/volk/volk_32f_s32f_stddev_32f_a16.h | 2 +- .../volk/volk_32f_stddev_and_mean_32f_x2_a16.h | 2 +- volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h | 2 +- .../volk/volk_32f_x2_s32f_interleave_16ic_a16.h | 2 +- .../volk_32fc_s32f_deinterleave_real_16i_a16.h | 2 +- .../volk/volk_32fc_s32f_magnitude_16i_a16.h | 2 +- .../volk_32fc_x2_conjugate_dot_prod_32fc_a16.h | 2 +- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 2 +- .../volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h | 2 +- .../volk/volk_8ic_s32f_deinterleave_real_32f_a16.h | 2 +- volk/include/volk/volk_attributes.h | 56 ------------------- volk/include/volk/volk_common.h | 62 +++++++++++++++++++--- 23 files changed, 76 insertions(+), 83 deletions(-) delete mode 100644 volk/include/volk/volk_attributes.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index bb4645526..a05f0fbfd 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -26,7 +26,6 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ volkincludedir = $(prefix)/include/volk volkinclude_HEADERS = \ - volk_attributes.h \ volk_complex.h \ volk_common.h \ $(top_gendir)/include/volk/volk_config_fixed.h \ diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a16.h b/volk/include/volk/volk_16ic_magnitude_16i_a16.h index 00d29b112..73c6f3390 100644 --- a/volk/include/volk/volk_16ic_magnitude_16i_a16.h +++ b/volk/include/volk/volk_16ic_magnitude_16i_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H #define INCLUDED_volk_16ic_magnitude_16i_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h index a4f0689e5..e4a9015b4 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h index 564aa1f5d..993445995 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h index 637ba9fd0..a136c0535 100644 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H #define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a16.h b/volk/include/volk/volk_32f_accumulator_s32f_a16.h index 94aff3a49..dd24a1e29 100644 --- a/volk/include/volk/volk_32f_accumulator_s32f_a16.h +++ b/volk/include/volk/volk_32f_accumulator_s32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H #define INCLUDED_volk_32f_accumulator_s32f_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32f_index_max_16u_a16.h b/volk/include/volk/volk_32f_index_max_16u_a16.h index 5c19bfca0..af1f35348 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a16.h +++ b/volk/include/volk/volk_32f_index_max_16u_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_index_max_16u_a16_H #define INCLUDED_volk_32f_index_max_16u_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h index 70ab3ccdb..f5b388e6d 100644 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h index 71b53ba3a..4acd2e13e 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H #define INCLUDED_volk_32f_s32f_convert_16i_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h index 095d7bd35..2927d616c 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H #define INCLUDED_volk_32f_s32f_convert_32i_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h index 509a46609..c114ea38f 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H #define INCLUDED_volk_32f_s32f_convert_8i_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h index 779ae2d39..c2b903657 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H #define INCLUDED_volk_32f_s32f_stddev_32f_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h index 9605322d3..10d72e09d 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h index 93151260f..2cd974070 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H #define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H -#include +#include #include diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h index cab3db50d..f7ad3fd18 100644 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h index 304515a5c..2460039d2 100644 --- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h +++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h index 96afa5ae9..f67ab0607 100644 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H #define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h index 78e28c903..f221237ff 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h index d404ee684..9657c8f6b 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h index d68d2462a..7d778796e 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h index d2cfa42f6..a2e0cd8de 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_attributes.h b/volk/include/volk/volk_attributes.h deleted file mode 100644 index 5345c1535..000000000 --- a/volk/include/volk/volk_attributes.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2011 Free Software Foundation, Inc. - * - * This file is part of GNU Radio - * - * GNU Radio is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * GNU Radio is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Radio; see the file COPYING. If not, write to - * the Free Software Foundation, Inc., 51 Franklin Street, - * Boston, MA 02110-1301, USA. - */ - -#ifndef INCLUDED_VOLK_ATTRIBUTES_H -#define INCLUDED_VOLK_ATTRIBUTES_H - -//////////////////////////////////////////////////////////////////////// -// Cross-platform attribute macros -//////////////////////////////////////////////////////////////////////// -#if defined __GNUC__ -# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) -# define __VOLK_ATTR_UNUSED __attribute__((unused)) -# define __VOLK_ATTR_INLINE __attribute__((always_inline)) -# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) -# if __GNUC__ >= 4 -# define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) -# define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) -# else -# define __VOLK_ATTR_EXPORT -# define __VOLK_ATTR_IMPORT -# endif -#elif _MSC_VER -# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) -# define __VOLK_ATTR_UNUSED -# define __VOLK_ATTR_INLINE __forceinline -# define __VOLK_ATTR_DEPRECATED __declspec(deprecated) -# define __VOLK_ATTR_EXPORT __declspec(dllexport) -# define __VOLK_ATTR_IMPORT __declspec(dllimport) -#else -# define __VOLK_ATTR_ALIGNED(x) -# define __VOLK_ATTR_UNUSED -# define __VOLK_ATTR_INLINE -# define __VOLK_ATTR_DEPRECATED -# define __VOLK_ATTR_EXPORT -# define __VOLK_ATTR_IMPORT -#endif - -#endif /* INCLUDED_VOLK_ATTRIBUTES_H */ diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index 0a47ff864..1e868561e 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -1,6 +1,62 @@ #ifndef INCLUDED_LIBVOLK_COMMON_H #define INCLUDED_LIBVOLK_COMMON_H +//////////////////////////////////////////////////////////////////////// +// Cross-platform attribute macros +//////////////////////////////////////////////////////////////////////// +#if defined __GNUC__ +# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) +# define __VOLK_ATTR_UNUSED __attribute__((unused)) +# define __VOLK_ATTR_INLINE __attribute__((always_inline)) +# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) +# if __GNUC__ >= 4 +# define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) +# define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) +# else +# define __VOLK_ATTR_EXPORT +# define __VOLK_ATTR_IMPORT +# endif +#elif _MSC_VER +# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) +# define __VOLK_ATTR_UNUSED +# define __VOLK_ATTR_INLINE __forceinline +# define __VOLK_ATTR_DEPRECATED __declspec(deprecated) +# define __VOLK_ATTR_EXPORT __declspec(dllexport) +# define __VOLK_ATTR_IMPORT __declspec(dllimport) +#else +# define __VOLK_ATTR_ALIGNED(x) +# define __VOLK_ATTR_UNUSED +# define __VOLK_ATTR_INLINE +# define __VOLK_ATTR_DEPRECATED +# define __VOLK_ATTR_EXPORT +# define __VOLK_ATTR_IMPORT +#endif + +//////////////////////////////////////////////////////////////////////// +// C-linkage declaration macros +// FIXME: due to the usage of complex.h, require gcc for c-linkage +//////////////////////////////////////////////////////////////////////// +#if defined(__cplusplus) && (__GNUC__) +# define __VOLK_DECL_BEGIN extern "C" { +# define __VOLK_DECL_END } +#else +# define __VOLK_DECL_BEGIN +# define __VOLK_DECL_END +#endif + +//////////////////////////////////////////////////////////////////////// +// Define VOLK_API for library symbols +// http://gcc.gnu.org/wiki/Visibility +//////////////////////////////////////////////////////////////////////// +#ifdef volk_EXPORTS +# define VOLK_API __VOLK_ATTR_EXPORT +#else +# define VOLK_API __VOLK_ATTR_IMPORT +#endif + +//////////////////////////////////////////////////////////////////////// +// The bit128 union used by some +//////////////////////////////////////////////////////////////////////// #include #ifdef LV_HAVE_MMX #include @@ -15,10 +71,4 @@ union bit128{ }; #endif /*LV_HAVE_MMX*/ -struct volk_func_desc { - const char **indices; - const int *arch_defs; - const int n_archs; -}; - #endif /*INCLUDED_LIBVOLK_COMMON_H*/ -- cgit From c40ef84defaeed0c9ec70e45a7e4019fa6d6e1b2 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Fri, 6 May 2011 14:27:48 -0700 Subject: volk: various backports from MSVC building 1) Added support for __cpuid intrinsic under MSVC 2) Fixed disambiguation for std::abs overload in qa code 3) Fixed bit128 union, the ifdefs were completely wrong --- volk/include/volk/volk_common.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index 1e868561e..12623073c 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -57,18 +57,30 @@ //////////////////////////////////////////////////////////////////////// // The bit128 union used by some //////////////////////////////////////////////////////////////////////// -#include -#ifdef LV_HAVE_MMX -#include +#include + +#ifdef LV_HAVE_SSE +#include +#endif + +#ifdef LV_HAVE_SSE2 +#include +#endif + union bit128{ uint16_t i16[8]; uint32_t i[4]; float f[4]; double d[2]; - __m128i int_vec; + + #ifdef LV_HAVE_SSE __m128 float_vec; + #endif + + #ifdef LV_HAVE_SSE2 + __m128i int_vec; __m128d double_vec; + #endif }; -#endif /*LV_HAVE_MMX*/ #endif /*INCLUDED_LIBVOLK_COMMON_H*/ -- cgit From ffbe7a0e951808ee972054b2182b97986595a9c5 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Mon, 9 May 2011 20:22:46 -0700 Subject: volk: implement type-agnostic operators for volk_complex --- volk/include/volk/volk_32fc_s32f_power_32fc_a16.h | 18 ++--- .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 2 +- .../volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h | 8 +-- .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h | 8 +-- volk/include/volk/volk_complex.h | 79 +++++++++++++--------- 5 files changed, 66 insertions(+), 49 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h index 90cc4f5e7..155b93ca2 100644 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h @@ -3,6 +3,14 @@ #include #include +#include + +//! raise a complex float to a real float power +static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a16(const lv_32fc_t exp, const float power){ + const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp)); + const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2); + return mag*lv_cmake(cosf(arg), sinf(arg)); +} #ifdef LV_HAVE_SSE #include @@ -72,11 +80,8 @@ static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const l number = quarterPoints * 4; #endif /* LV_HAVE_LIB_SIMDMATH */ - lv_32fc_t complexPower; - ((float*)&complexPower)[0] = power; - ((float*)&complexPower)[1] = 0; for(;number < num_points; number++){ - *cPtr++ = lv_cpow((*aPtr++), complexPower); + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); } } #endif /* LV_HAVE_SSE */ @@ -93,12 +98,9 @@ static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, con lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; unsigned int number = 0; - lv_32fc_t complexPower; - ((float*)&complexPower)[0] = power; - ((float*)&complexPower)[1] = 0.0; for(number = 0; number < num_points; number++){ - *cPtr++ = lv_cpow((*aPtr++), complexPower); + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h index 73576a766..6b22d9f81 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -131,7 +131,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result Isum += Im; } - result[0] = lv_32fc_init(Rsum,Isum); + result[0] = lv_cmake(Rsum,Isum); return; } diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h index 9e8982e9b..7307ae484 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h @@ -55,10 +55,10 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVe for(; number < num_points; number++){ float aReal = (float)*a8Ptr++; float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + lv_32fc_t aVal = lv_cmake(aReal, aImag ); float bReal = (float)*b8Ptr++; float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); lv_32fc_t temp = aVal * bVal; *c16Ptr++ = (int16_t)lv_creal(temp); @@ -83,10 +83,10 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cV for(number =0; number < num_points; number++){ float aReal = (float)*a8Ptr++; float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + lv_32fc_t aVal = lv_cmake(aReal, aImag ); float bReal = (float)*b8Ptr++; float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); lv_32fc_t temp = aVal * bVal; *c16Ptr++ = (int16_t)lv_creal(temp); diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h index fa58ff058..adc7c0599 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h @@ -75,10 +75,10 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t for(; number < num_points; number++){ float aReal = (float)*a8Ptr++; float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + lv_32fc_t aVal = lv_cmake(aReal, aImag ); float bReal = (float)*b8Ptr++; float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); lv_32fc_t temp = aVal * bVal; *cFloatPtr++ = lv_creal(temp) / scalar; @@ -104,10 +104,10 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_ for(number = 0; number < num_points; number++){ float aReal = (float)*a8Ptr++; float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + lv_32fc_t aVal = lv_cmake(aReal, aImag ); float bReal = (float)*b8Ptr++; float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); lv_32fc_t temp = aVal * bVal; *cPtr++ = (lv_creal(temp) * invScalar); diff --git a/volk/include/volk/volk_complex.h b/volk/include/volk/volk_complex.h index b20b5cf47..5bd925044 100644 --- a/volk/include/volk/volk_complex.h +++ b/volk/include/volk/volk_complex.h @@ -2,8 +2,21 @@ #define INCLUDE_VOLK_COMPLEX_H /*! - \brief This header file is to prevent issues with having and variables in the same code as the gcc compiler does not allow that -*/ + * \brief Provide typedefs and operators for all complex types in C and C++. + * + * The typedefs encompass all signed integer and floating point types. + * Each operator function is intended to work across all data types. + * Under C++, these operators are defined as inline templates. + * Under C, these operators are defined as preprocessor macros. + * The use of macros makes the operators agnostic to the type. + * + * The following operator functions are defined: + * - lv_cmake - make a complex type from components + * - lv_creal - get the real part of the complex number + * - lv_cimag - get the imaginary part of the complex number + * - lv_conj - take the conjugate of the complex number + */ + #ifdef __cplusplus #include @@ -12,60 +25,62 @@ typedef std::complex lv_8sc_t; typedef std::complex lv_16sc_t; typedef std::complex lv_32sc_t; +typedef std::complex lv_64sc_t; typedef std::complex lv_32fc_t; typedef std::complex lv_64fc_t; -static inline float lv_creal(const lv_32fc_t x){ - return x.real(); -} - -static inline float lv_cimag(const lv_32fc_t x){ - return x.imag(); +template inline std::complex lv_cmake(const T &r, const T &i){ + return std::complex(r, i); } -static inline lv_32fc_t lv_conj(const lv_32fc_t x){ - return std::conj(x); +template inline typename T::value_type lv_creal(const T &x){ + return x.real(); } -static inline lv_32fc_t lv_cpow(const lv_32fc_t x, const lv_32fc_t y){ - return std::pow(x, y); +template inline typename T::value_type lv_cimag(const T &x){ + return x.imag(); } -static inline lv_32fc_t lv_32fc_init(const float x, const float y){ - return std::complex(x,y); +template inline T lv_conj(const T &x){ + return std::conj(x); } -#else +#else /* __cplusplus */ #include typedef char complex lv_8sc_t; typedef short complex lv_16sc_t; -typedef int complex lv_32sc_t; +typedef long complex lv_32sc_t; +typedef long long complex lv_64sc_t; typedef float complex lv_32fc_t; typedef double complex lv_64fc_t; -static inline float lv_creal(const lv_32fc_t x){ - return creal(x); -} +#define lv_cmake(r, i) ((r) + _Complex_I*(i)) -static inline float lv_cimag(const lv_32fc_t x){ - return cimag(x); -} +// When GNUC is available, use the complex extensions. +// The extensions always return the correct value type. +// http://gcc.gnu.org/onlinedocs/gcc/Complex.html +#ifdef __GNUC__ -static inline lv_32fc_t lv_conj(const lv_32fc_t x){ - return conj(x); -} +#define lv_creal(x) (__real__(x)) -static inline lv_32fc_t lv_cpow(const lv_32fc_t x, const lv_32fc_t y){ - return cpow(x, y); -} +#define lv_cimag(x) (__imag__(x)) -static inline lv_32fc_t lv_32fc_init(const float x, const float y){ - return x + I*y; -} +#define lv_conj(x) (~(x)) + +// When not available, use the c99 complex function family, +// which always returns double regardless of the input type. +#else /* __GNUC__ */ + +#define lv_creal(x) (creal(x)) + +#define lv_cimag(x) (cimag(x)) + +#define lv_conj(x) (conj(x)) -#endif +#endif /* __GNUC__ */ +#endif /* __cplusplus */ #endif /* INCLUDE_VOLK_COMPLEX_H */ -- cgit From 21d6870a6ef5284a5941dce1484bcfff6684ffea Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Mon, 9 May 2011 20:31:56 -0700 Subject: volk: cmake support for volk (gcc + msvc) --- volk/include/volk/volk_common.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index 12623073c..2c935d1fb 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -32,6 +32,14 @@ # define __VOLK_ATTR_IMPORT #endif +//////////////////////////////////////////////////////////////////////// +// Ignore annoying warnings in MSVC +//////////////////////////////////////////////////////////////////////// +#if defined(_MSC_VER) +# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data +# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2' +#endif + //////////////////////////////////////////////////////////////////////// // C-linkage declaration macros // FIXME: due to the usage of complex.h, require gcc for c-linkage -- cgit From a1b9b5c16c53bedfe8ebab39055a36dee387a9a4 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Thu, 12 May 2011 15:07:31 -0700 Subject: Volk: forgot to add prefs.c/h to git... --- volk/include/volk/volk_prefs.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 volk/include/volk/volk_prefs.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_prefs.h b/volk/include/volk/volk_prefs.h new file mode 100644 index 000000000..77bcb709b --- /dev/null +++ b/volk/include/volk/volk_prefs.h @@ -0,0 +1,19 @@ +#ifndef INCLUDED_VOLK_PREFS_H +#define INCLUDED_VOLK_PREFS_H + +struct volk_arch_pref { + char name[128]; + char arch[32]; +}; + +//////////////////////////////////////////////////////////////////////// +// get path to volk_config profiling info +//////////////////////////////////////////////////////////////////////// +void get_config_path(char *); + +//////////////////////////////////////////////////////////////////////// +// load prefs into global prefs struct +//////////////////////////////////////////////////////////////////////// +int load_preferences(struct volk_arch_pref **); + +#endif //INCLUDED_VOLK_PREFS_H -- cgit From 88d41e8bc991476880d40d2369018e9adf0011ff Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Thu, 12 May 2011 16:17:46 -0700 Subject: Volk: added an AVX impl (of 32f multiply) just to see if it's any faster. It's not. --- volk/include/volk/volk_32f_x2_multiply_32f_a16.h | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h index cef17f5a6..885941abf 100644 --- a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h @@ -43,6 +43,45 @@ static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX +#include +/*! + \brief Multiplies the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_a16_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_GENERIC /*! \brief Multiplys the two input vectors and store their results in the third vector -- cgit From 7d349848c60f99f9906cb57d0ebe3c7dd35096bd Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Thu, 12 May 2011 17:39:11 -0700 Subject: Volk: avx impl for 32f_s32f_convert_32i --- volk/include/volk/volk_32f_s32f_convert_32i_a16.h | 36 +++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h index 2927d616c..3f5044313 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h @@ -5,6 +5,42 @@ #include #include +#ifdef LV_HAVE_AVX +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1; + __m256i intInputVal1; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; + + intInputVal1 = _mm256_cvtps_epi32(_mm256_mul_ps(inputVal1, vScalar)); + + _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE2 #include /*! -- cgit From 9bfe75fd7c6a7069db2d2a98195faabf6ba248e2 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Fri, 13 May 2011 13:58:01 -0700 Subject: volk: do not install library-only headers --- volk/include/volk/Makefile.am | 3 +-- volk/include/volk/volk_prefs.h | 12 +++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index a05f0fbfd..5f9e134bc 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -28,12 +28,11 @@ volkincludedir = $(prefix)/include/volk volkinclude_HEADERS = \ volk_complex.h \ volk_common.h \ + volk_prefs.h \ $(top_gendir)/include/volk/volk_config_fixed.h \ $(top_gendir)/include/volk/volk_typedefs.h \ $(top_gendir)/include/volk/volk.h \ $(top_gendir)/include/volk/volk_cpu.h \ - $(top_gendir)/include/volk/volk_machines.h \ - $(top_gendir)/include/volk/volk_environment_init.h \ volk_16i_x5_add_quad_16i_x4_a16.h \ volk_16i_branch_4_state_8_a16.h \ volk_16ic_deinterleave_16i_x2_a16.h \ diff --git a/volk/include/volk/volk_prefs.h b/volk/include/volk/volk_prefs.h index 77bcb709b..2a7f7e79f 100644 --- a/volk/include/volk/volk_prefs.h +++ b/volk/include/volk/volk_prefs.h @@ -1,7 +1,11 @@ #ifndef INCLUDED_VOLK_PREFS_H #define INCLUDED_VOLK_PREFS_H -struct volk_arch_pref { +#include + +__VOLK_DECL_BEGIN + +struct VOLK_API volk_arch_pref { char name[128]; char arch[32]; }; @@ -9,11 +13,13 @@ struct volk_arch_pref { //////////////////////////////////////////////////////////////////////// // get path to volk_config profiling info //////////////////////////////////////////////////////////////////////// -void get_config_path(char *); +VOLK_API void get_config_path(char *); //////////////////////////////////////////////////////////////////////// // load prefs into global prefs struct //////////////////////////////////////////////////////////////////////// -int load_preferences(struct volk_arch_pref **); +VOLK_API int load_preferences(struct volk_arch_pref **); + +__VOLK_DECL_END #endif //INCLUDED_VOLK_PREFS_H -- cgit From 9d2ee4d74d312fd788f02a85b1d24ac16e4eb4eb Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 18 May 2011 13:19:48 -0700 Subject: Volk: rename aligned functions to just _a instead of _a16 --- volk/include/volk/volk_16i_branch_4_state_8_a.h | 194 +++++++++ volk/include/volk/volk_16i_branch_4_state_8_a16.h | 194 --------- volk/include/volk/volk_16i_convert_8i_a.h | 69 +++ volk/include/volk/volk_16i_convert_8i_a16.h | 69 --- volk/include/volk/volk_16i_max_star_16i_a.h | 108 +++++ volk/include/volk/volk_16i_max_star_16i_a16.h | 108 ----- .../volk/volk_16i_max_star_horizontal_16i_a.h | 130 ++++++ .../volk/volk_16i_max_star_horizontal_16i_a16.h | 130 ------ .../volk/volk_16i_permute_and_scalar_add_a.h | 139 ++++++ .../volk/volk_16i_permute_and_scalar_add_a16.h | 139 ------ volk/include/volk/volk_16i_s32f_convert_32f_a.h | 119 ++++++ volk/include/volk/volk_16i_s32f_convert_32f_a16.h | 119 ------ .../include/volk/volk_16i_x4_quad_max_star_16i_a.h | 191 +++++++++ .../volk/volk_16i_x4_quad_max_star_16i_a16.h | 191 --------- volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h | 136 ++++++ .../include/volk/volk_16i_x5_add_quad_16i_x4_a16.h | 136 ------ .../include/volk/volk_16ic_deinterleave_16i_x2_a.h | 158 +++++++ .../volk/volk_16ic_deinterleave_16i_x2_a16.h | 158 ------- .../volk/volk_16ic_deinterleave_real_16i_a.h | 120 ++++++ .../volk/volk_16ic_deinterleave_real_16i_a16.h | 120 ------ .../volk/volk_16ic_deinterleave_real_8i_a.h | 94 +++++ .../volk/volk_16ic_deinterleave_real_8i_a16.h | 94 ----- volk/include/volk/volk_16ic_magnitude_16i_a.h | 191 +++++++++ volk/include/volk/volk_16ic_magnitude_16i_a16.h | 191 --------- .../volk/volk_16ic_s32f_deinterleave_32f_x2_a.h | 109 +++++ .../volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h | 109 ----- .../volk/volk_16ic_s32f_deinterleave_real_32f_a.h | 126 ++++++ .../volk_16ic_s32f_deinterleave_real_32f_a16.h | 126 ------ volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h | 180 ++++++++ .../volk/volk_16ic_s32f_magnitude_32f_a16.h | 180 -------- volk/include/volk/volk_16u_byteswap_a.h | 77 ++++ volk/include/volk/volk_16u_byteswap_a16.h | 77 ---- volk/include/volk/volk_32f_accumulator_s32f_a.h | 68 +++ volk/include/volk/volk_32f_accumulator_s32f_a16.h | 68 --- volk/include/volk/volk_32f_convert_64f_a.h | 70 +++ volk/include/volk/volk_32f_convert_64f_a16.h | 70 --- volk/include/volk/volk_32f_index_max_16u_a.h | 149 +++++++ volk/include/volk/volk_32f_index_max_16u_a16.h | 149 ------- .../volk/volk_32f_s32f_32f_fm_detect_32f_a.h | 120 ++++++ .../volk/volk_32f_s32f_32f_fm_detect_32f_a16.h | 120 ------ ...volk_32f_s32f_calc_spectral_noise_floor_32f_a.h | 168 ++++++++ ...lk_32f_s32f_calc_spectral_noise_floor_32f_a16.h | 168 -------- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 111 +++++ volk/include/volk/volk_32f_s32f_convert_16i_a16.h | 111 ----- volk/include/volk/volk_32f_s32f_convert_32i_a.h | 143 +++++++ volk/include/volk/volk_32f_s32f_convert_32i_a16.h | 143 ------- volk/include/volk/volk_32f_s32f_convert_8i_a.h | 118 ++++++ volk/include/volk/volk_32f_s32f_convert_8i_a16.h | 118 ------ volk/include/volk/volk_32f_s32f_normalize_a.h | 81 ++++ volk/include/volk/volk_32f_s32f_normalize_a16.h | 81 ---- volk/include/volk/volk_32f_s32f_power_32f_a.h | 144 +++++++ volk/include/volk/volk_32f_s32f_power_32f_a16.h | 144 ------- volk/include/volk/volk_32f_s32f_stddev_32f_a.h | 145 +++++++ volk/include/volk/volk_32f_s32f_stddev_32f_a16.h | 145 ------- volk/include/volk/volk_32f_sqrt_32f_a.h | 77 ++++ volk/include/volk/volk_32f_sqrt_32f_a16.h | 77 ---- .../volk/volk_32f_stddev_and_mean_32f_x2_a.h | 170 ++++++++ .../volk/volk_32f_stddev_and_mean_32f_x2_a16.h | 170 -------- volk/include/volk/volk_32f_x2_add_32f_a.h | 81 ++++ volk/include/volk/volk_32f_x2_add_32f_a16.h | 81 ---- volk/include/volk/volk_32f_x2_divide_32f_a.h | 82 ++++ volk/include/volk/volk_32f_x2_divide_32f_a16.h | 82 ---- volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 185 ++++++++ volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h | 185 -------- volk/include/volk/volk_32f_x2_interleave_32fc_a.h | 75 ++++ .../include/volk/volk_32f_x2_interleave_32fc_a16.h | 75 ---- volk/include/volk/volk_32f_x2_max_32f_a.h | 85 ++++ volk/include/volk/volk_32f_x2_max_32f_a16.h | 85 ---- volk/include/volk/volk_32f_x2_min_32f_a.h | 85 ++++ volk/include/volk/volk_32f_x2_min_32f_a16.h | 85 ---- volk/include/volk/volk_32f_x2_multiply_32f_a.h | 120 ++++++ volk/include/volk/volk_32f_x2_multiply_32f_a16.h | 120 ------ .../volk/volk_32f_x2_s32f_interleave_16ic_a.h | 156 +++++++ .../volk/volk_32f_x2_s32f_interleave_16ic_a16.h | 156 ------- volk/include/volk/volk_32f_x2_subtract_32f_a.h | 81 ++++ volk/include/volk/volk_32f_x2_subtract_32f_a16.h | 81 ---- volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h | 151 +++++++ .../include/volk/volk_32f_x3_sum_of_poly_32f_a16.h | 151 ------- volk/include/volk/volk_32fc_32f_multiply_32fc_a.h | 95 +++++ .../include/volk/volk_32fc_32f_multiply_32fc_a16.h | 95 ----- .../include/volk/volk_32fc_deinterleave_32f_x2_a.h | 75 ++++ .../volk/volk_32fc_deinterleave_32f_x2_a16.h | 75 ---- .../include/volk/volk_32fc_deinterleave_64f_x2_a.h | 78 ++++ .../volk/volk_32fc_deinterleave_64f_x2_a16.h | 78 ---- .../volk/volk_32fc_deinterleave_real_32f_a.h | 68 +++ .../volk/volk_32fc_deinterleave_real_32f_a16.h | 68 --- .../volk/volk_32fc_deinterleave_real_64f_a.h | 66 +++ .../volk/volk_32fc_deinterleave_real_64f_a16.h | 66 --- volk/include/volk/volk_32fc_index_max_16u_a.h | 215 ++++++++++ volk/include/volk/volk_32fc_index_max_16u_a16.h | 215 ---------- volk/include/volk/volk_32fc_magnitude_32f_a.h | 132 ++++++ volk/include/volk/volk_32fc_magnitude_32f_a16.h | 132 ------ volk/include/volk/volk_32fc_s32f_atan2_32f_a.h | 158 +++++++ volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h | 158 ------- .../volk/volk_32fc_s32f_deinterleave_real_16i_a.h | 81 ++++ .../volk_32fc_s32f_deinterleave_real_16i_a16.h | 81 ---- volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h | 159 +++++++ .../volk/volk_32fc_s32f_magnitude_16i_a16.h | 159 ------- volk/include/volk/volk_32fc_s32f_power_32fc_a.h | 111 +++++ volk/include/volk/volk_32fc_s32f_power_32fc_a16.h | 111 ----- .../volk/volk_32fc_s32f_power_spectrum_32f_a.h | 126 ++++++ .../volk/volk_32fc_s32f_power_spectrum_32f_a16.h | 126 ------ ...olk_32fc_s32f_x2_power_spectral_density_32f_a.h | 134 ++++++ ...k_32fc_s32f_x2_power_spectral_density_32f_a16.h | 134 ------ .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h | 345 +++++++++++++++ .../volk_32fc_x2_conjugate_dot_prod_32fc_a16.h | 345 --------------- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h | 469 +++++++++++++++++++++ volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 469 --------------------- volk/include/volk/volk_32fc_x2_multiply_32fc_a.h | 94 +++++ volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h | 94 ----- ...lk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h | 126 ++++++ ..._32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h | 126 ------ volk/include/volk/volk_32fc_x2_square_dist_32f_a.h | 112 +++++ .../volk/volk_32fc_x2_square_dist_32f_a16.h | 112 ----- volk/include/volk/volk_32i_s32f_convert_32f_a.h | 73 ++++ volk/include/volk/volk_32i_s32f_convert_32f_a16.h | 73 ---- volk/include/volk/volk_32i_x2_and_32i_a.h | 81 ++++ volk/include/volk/volk_32i_x2_and_32i_a16.h | 81 ---- volk/include/volk/volk_32i_x2_or_32i_a.h | 81 ++++ volk/include/volk/volk_32i_x2_or_32i_a16.h | 81 ---- volk/include/volk/volk_32u_byteswap_a.h | 77 ++++ volk/include/volk/volk_32u_byteswap_a16.h | 77 ---- volk/include/volk/volk_32u_popcnt_a.h | 36 ++ volk/include/volk/volk_32u_popcnt_a16.h | 36 -- volk/include/volk/volk_64f_convert_32f_a.h | 67 +++ volk/include/volk/volk_64f_convert_32f_a16.h | 67 --- volk/include/volk/volk_64f_x2_max_64f_a.h | 71 ++++ volk/include/volk/volk_64f_x2_max_64f_a16.h | 71 ---- volk/include/volk/volk_64f_x2_min_64f_a.h | 71 ++++ volk/include/volk/volk_64f_x2_min_64f_a16.h | 71 ---- volk/include/volk/volk_64u_byteswap_a.h | 88 ++++ volk/include/volk/volk_64u_byteswap_a16.h | 88 ---- volk/include/volk/volk_64u_popcnt_a.h | 50 +++ volk/include/volk/volk_64u_popcnt_a16.h | 50 --- volk/include/volk/volk_8i_convert_16i_a.h | 83 ++++ volk/include/volk/volk_8i_convert_16i_a16.h | 83 ---- volk/include/volk/volk_8i_s32f_convert_32f_a.h | 106 +++++ volk/include/volk/volk_8i_s32f_convert_32f_a16.h | 106 ----- volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h | 77 ++++ .../volk/volk_8ic_deinterleave_16i_x2_a16.h | 77 ---- .../volk/volk_8ic_deinterleave_real_16i_a.h | 66 +++ .../volk/volk_8ic_deinterleave_real_16i_a16.h | 66 --- .../include/volk/volk_8ic_deinterleave_real_8i_a.h | 67 +++ .../volk/volk_8ic_deinterleave_real_8i_a16.h | 67 --- .../volk/volk_8ic_s32f_deinterleave_32f_x2_a.h | 165 ++++++++ .../volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h | 165 -------- .../volk/volk_8ic_s32f_deinterleave_real_32f_a.h | 134 ++++++ .../volk/volk_8ic_s32f_deinterleave_real_32f_a16.h | 134 ------ .../volk/volk_8ic_x2_multiply_conjugate_16ic_a.h | 101 +++++ .../volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h | 101 ----- .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h | 122 ++++++ .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h | 122 ------ 152 files changed, 9066 insertions(+), 9066 deletions(-) create mode 100644 volk/include/volk/volk_16i_branch_4_state_8_a.h delete mode 100644 volk/include/volk/volk_16i_branch_4_state_8_a16.h create mode 100644 volk/include/volk/volk_16i_convert_8i_a.h delete mode 100644 volk/include/volk/volk_16i_convert_8i_a16.h create mode 100644 volk/include/volk/volk_16i_max_star_16i_a.h delete mode 100644 volk/include/volk/volk_16i_max_star_16i_a16.h create mode 100644 volk/include/volk/volk_16i_max_star_horizontal_16i_a.h delete mode 100644 volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h create mode 100644 volk/include/volk/volk_16i_permute_and_scalar_add_a.h delete mode 100644 volk/include/volk/volk_16i_permute_and_scalar_add_a16.h create mode 100644 volk/include/volk/volk_16i_s32f_convert_32f_a.h delete mode 100644 volk/include/volk/volk_16i_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h delete mode 100644 volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h create mode 100644 volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h delete mode 100644 volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h create mode 100644 volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h delete mode 100644 volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h create mode 100644 volk/include/volk/volk_16ic_deinterleave_real_16i_a.h delete mode 100644 volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h create mode 100644 volk/include/volk/volk_16ic_deinterleave_real_8i_a.h delete mode 100644 volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h create mode 100644 volk/include/volk/volk_16ic_magnitude_16i_a.h delete mode 100644 volk/include/volk/volk_16ic_magnitude_16i_a16.h create mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h delete mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h create mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h delete mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h create mode 100644 volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h delete mode 100644 volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h create mode 100644 volk/include/volk/volk_16u_byteswap_a.h delete mode 100644 volk/include/volk/volk_16u_byteswap_a16.h create mode 100644 volk/include/volk/volk_32f_accumulator_s32f_a.h delete mode 100644 volk/include/volk/volk_32f_accumulator_s32f_a16.h create mode 100644 volk/include/volk/volk_32f_convert_64f_a.h delete mode 100644 volk/include/volk/volk_32f_convert_64f_a16.h create mode 100644 volk/include/volk/volk_32f_index_max_16u_a.h delete mode 100644 volk/include/volk/volk_32f_index_max_16u_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_16i_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_16i_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_32i_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_32i_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_8i_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_8i_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_normalize_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_normalize_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_power_32f_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_power_32f_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_stddev_32f_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_stddev_32f_a16.h create mode 100644 volk/include/volk/volk_32f_sqrt_32f_a.h delete mode 100644 volk/include/volk/volk_32f_sqrt_32f_a16.h create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h delete mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h create mode 100644 volk/include/volk/volk_32f_x2_add_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_add_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_divide_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_divide_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_dot_prod_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_interleave_32fc_a.h delete mode 100644 volk/include/volk/volk_32f_x2_interleave_32fc_a16.h create mode 100644 volk/include/volk/volk_32f_x2_max_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_max_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_min_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_min_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_multiply_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_multiply_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h delete mode 100644 volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h create mode 100644 volk/include/volk/volk_32f_x2_subtract_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_subtract_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_32f_multiply_32fc_a.h delete mode 100644 volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_a.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h create mode 100644 volk/include/volk/volk_32fc_index_max_16u_a.h delete mode 100644 volk/include/volk/volk_32fc_index_max_16u_a16.h create mode 100644 volk/include/volk/volk_32fc_magnitude_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_magnitude_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_atan2_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h delete mode 100644 volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h delete mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_power_32fc_a.h delete mode 100644 volk/include/volk/volk_32fc_s32f_power_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h delete mode 100644 volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h delete mode 100644 volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_multiply_32fc_a.h delete mode 100644 volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_square_dist_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h create mode 100644 volk/include/volk/volk_32i_s32f_convert_32f_a.h delete mode 100644 volk/include/volk/volk_32i_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_32i_x2_and_32i_a.h delete mode 100644 volk/include/volk/volk_32i_x2_and_32i_a16.h create mode 100644 volk/include/volk/volk_32i_x2_or_32i_a.h delete mode 100644 volk/include/volk/volk_32i_x2_or_32i_a16.h create mode 100644 volk/include/volk/volk_32u_byteswap_a.h delete mode 100644 volk/include/volk/volk_32u_byteswap_a16.h create mode 100644 volk/include/volk/volk_32u_popcnt_a.h delete mode 100644 volk/include/volk/volk_32u_popcnt_a16.h create mode 100644 volk/include/volk/volk_64f_convert_32f_a.h delete mode 100644 volk/include/volk/volk_64f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_64f_x2_max_64f_a.h delete mode 100644 volk/include/volk/volk_64f_x2_max_64f_a16.h create mode 100644 volk/include/volk/volk_64f_x2_min_64f_a.h delete mode 100644 volk/include/volk/volk_64f_x2_min_64f_a16.h create mode 100644 volk/include/volk/volk_64u_byteswap_a.h delete mode 100644 volk/include/volk/volk_64u_byteswap_a16.h create mode 100644 volk/include/volk/volk_64u_popcnt_a.h delete mode 100644 volk/include/volk/volk_64u_popcnt_a16.h create mode 100644 volk/include/volk/volk_8i_convert_16i_a.h delete mode 100644 volk/include/volk/volk_8i_convert_16i_a16.h create mode 100644 volk/include/volk/volk_8i_s32f_convert_32f_a.h delete mode 100644 volk/include/volk/volk_8i_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h delete mode 100644 volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h create mode 100644 volk/include/volk/volk_8ic_deinterleave_real_16i_a.h delete mode 100644 volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h create mode 100644 volk/include/volk/volk_8ic_deinterleave_real_8i_a.h delete mode 100644 volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h create mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h delete mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h create mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h delete mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h create mode 100644 volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h delete mode 100644 volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h create mode 100644 volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h delete mode 100644 volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a.h b/volk/include/volk/volk_16i_branch_4_state_8_a.h new file mode 100644 index 000000000..5eb03b346 --- /dev/null +++ b/volk/include/volk/volk_16i_branch_4_state_8_a.h @@ -0,0 +1,194 @@ +#ifndef INCLUDED_volk_16i_branch_4_state_8_a16_H +#define INCLUDED_volk_16i_branch_4_state_8_a16_H + + +#include +#include + + + + +#ifdef LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16i_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + + __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; + + + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = 1; + + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + xmm0 = _mm_load_si128((__m128i*)permuters[0]); + xmm6 = _mm_load_si128((__m128i*)permuters[1]); + xmm8 = _mm_load_si128((__m128i*)permuters[2]); + xmm10 = _mm_load_si128((__m128i*)permuters[3]); + + for(; i < bound; ++i) { + + xmm5 = _mm_load_si128(p_src0); + + + + + + + + + + xmm0 = _mm_shuffle_epi8(xmm5, xmm0); + xmm6 = _mm_shuffle_epi8(xmm5, xmm6); + xmm8 = _mm_shuffle_epi8(xmm5, xmm8); + xmm10 = _mm_shuffle_epi8(xmm5, xmm10); + + p_src0 += 4; + + + xmm5 = _mm_add_epi16(xmm1, xmm2); + + xmm6 = _mm_add_epi16(xmm2, xmm6); + xmm8 = _mm_add_epi16(xmm1, xmm8); + + + xmm7 = _mm_load_si128(p_cntl2); + xmm9 = _mm_load_si128(p_cntl3); + + xmm0 = _mm_add_epi16(xmm5, xmm0); + + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm5 = _mm_load_si128(&p_cntl2[1]); + xmm11 = _mm_load_si128(&p_cntl3[1]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm7); + + + + xmm7 = _mm_load_si128(&p_cntl2[2]); + xmm9 = _mm_load_si128(&p_cntl3[2]); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm6 = _mm_add_epi16(xmm6, xmm5); + + + xmm5 = _mm_load_si128(&p_cntl2[3]); + xmm11 = _mm_load_si128(&p_cntl3[3]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm8 = _mm_add_epi16(xmm8, xmm7); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + _mm_store_si128(p_target, xmm0); + _mm_store_si128(&p_target[1], xmm6); + + xmm10 = _mm_add_epi16(xmm5, xmm10); + + _mm_store_si128(&p_target[2], xmm8); + + _mm_store_si128(&p_target[3], xmm10); + + p_target += 3; + } +} + + +#endif /*LV_HAVE_SSEs*/ + +#ifdef LV_HAVE_GENERIC +static inline void volk_16i_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + int i = 0; + + int bound = 4; + + for(; i < bound; ++i) { + target[i* 8] = src0[((char)permuters[i][0])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8] & scalars[2]) + + (cntl3[i * 8] & scalars[3]); + target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 1] & scalars[2]) + + (cntl3[i * 8 + 1] & scalars[3]); + target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 2] & scalars[2]) + + (cntl3[i * 8 + 2] & scalars[3]); + target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 3] & scalars[2]) + + (cntl3[i * 8 + 3] & scalars[3]); + target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 4] & scalars[2]) + + (cntl3[i * 8 + 4] & scalars[3]); + target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 5] & scalars[2]) + + (cntl3[i * 8 + 5] & scalars[3]); + target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 6] & scalars[2]) + + (cntl3[i * 8 + 6] & scalars[3]); + target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 7] & scalars[2]) + + (cntl3[i * 8 + 7] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16i_branch_4_state_8_a16_H*/ diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a16.h b/volk/include/volk/volk_16i_branch_4_state_8_a16.h deleted file mode 100644 index 5eb03b346..000000000 --- a/volk/include/volk/volk_16i_branch_4_state_8_a16.h +++ /dev/null @@ -1,194 +0,0 @@ -#ifndef INCLUDED_volk_16i_branch_4_state_8_a16_H -#define INCLUDED_volk_16i_branch_4_state_8_a16_H - - -#include -#include - - - - -#ifdef LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16i_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; - - __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; - - - - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; - - int i = 0; - - int bound = 1; - - - xmm0 = _mm_load_si128(p_scalars); - - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - - xmm0 = _mm_load_si128((__m128i*)permuters[0]); - xmm6 = _mm_load_si128((__m128i*)permuters[1]); - xmm8 = _mm_load_si128((__m128i*)permuters[2]); - xmm10 = _mm_load_si128((__m128i*)permuters[3]); - - for(; i < bound; ++i) { - - xmm5 = _mm_load_si128(p_src0); - - - - - - - - - - xmm0 = _mm_shuffle_epi8(xmm5, xmm0); - xmm6 = _mm_shuffle_epi8(xmm5, xmm6); - xmm8 = _mm_shuffle_epi8(xmm5, xmm8); - xmm10 = _mm_shuffle_epi8(xmm5, xmm10); - - p_src0 += 4; - - - xmm5 = _mm_add_epi16(xmm1, xmm2); - - xmm6 = _mm_add_epi16(xmm2, xmm6); - xmm8 = _mm_add_epi16(xmm1, xmm8); - - - xmm7 = _mm_load_si128(p_cntl2); - xmm9 = _mm_load_si128(p_cntl3); - - xmm0 = _mm_add_epi16(xmm5, xmm0); - - - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); - - xmm5 = _mm_load_si128(&p_cntl2[1]); - xmm11 = _mm_load_si128(&p_cntl3[1]); - - xmm7 = _mm_add_epi16(xmm7, xmm9); - - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); - - xmm0 = _mm_add_epi16(xmm0, xmm7); - - - - xmm7 = _mm_load_si128(&p_cntl2[2]); - xmm9 = _mm_load_si128(&p_cntl3[2]); - - xmm5 = _mm_add_epi16(xmm5, xmm11); - - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); - - xmm6 = _mm_add_epi16(xmm6, xmm5); - - - xmm5 = _mm_load_si128(&p_cntl2[3]); - xmm11 = _mm_load_si128(&p_cntl3[3]); - - xmm7 = _mm_add_epi16(xmm7, xmm9); - - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); - - xmm8 = _mm_add_epi16(xmm8, xmm7); - - xmm5 = _mm_add_epi16(xmm5, xmm11); - - _mm_store_si128(p_target, xmm0); - _mm_store_si128(&p_target[1], xmm6); - - xmm10 = _mm_add_epi16(xmm5, xmm10); - - _mm_store_si128(&p_target[2], xmm8); - - _mm_store_si128(&p_target[3], xmm10); - - p_target += 3; - } -} - - -#endif /*LV_HAVE_SSEs*/ - -#ifdef LV_HAVE_GENERIC -static inline void volk_16i_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - int i = 0; - - int bound = 4; - - for(; i < bound; ++i) { - target[i* 8] = src0[((char)permuters[i][0])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8] & scalars[2]) - + (cntl3[i * 8] & scalars[3]); - target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 1] & scalars[2]) - + (cntl3[i * 8 + 1] & scalars[3]); - target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 2] & scalars[2]) - + (cntl3[i * 8 + 2] & scalars[3]); - target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 3] & scalars[2]) - + (cntl3[i * 8 + 3] & scalars[3]); - target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 4] & scalars[2]) - + (cntl3[i * 8 + 4] & scalars[3]); - target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 5] & scalars[2]) - + (cntl3[i * 8 + 5] & scalars[3]); - target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 6] & scalars[2]) - + (cntl3[i * 8 + 6] & scalars[3]); - target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 7] & scalars[2]) - + (cntl3[i * 8 + 7] & scalars[3]); - - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_16i_branch_4_state_8_a16_H*/ diff --git a/volk/include/volk/volk_16i_convert_8i_a.h b/volk/include/volk/volk_16i_convert_8i_a.h new file mode 100644 index 000000000..4d51e5903 --- /dev/null +++ b/volk/include/volk/volk_16i_convert_8i_a.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_volk_16i_convert_8i_a16_H +#define INCLUDED_volk_16i_convert_8i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_convert_8i_a16_H */ diff --git a/volk/include/volk/volk_16i_convert_8i_a16.h b/volk/include/volk/volk_16i_convert_8i_a16.h deleted file mode 100644 index 4d51e5903..000000000 --- a/volk/include/volk/volk_16i_convert_8i_a16.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef INCLUDED_volk_16i_convert_8i_a16_H -#define INCLUDED_volk_16i_convert_8i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16i_convert_8i_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - - // Load the 16 values - inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); - - ret = _mm_packs_epi16(inputVal1, inputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, ret); - - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16i_convert_8i_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16i_convert_8i_a16_H */ diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/include/volk/volk_16i_max_star_16i_a.h new file mode 100644 index 000000000..063444279 --- /dev/null +++ b/volk/include/volk/volk_16i_max_star_16i_a.h @@ -0,0 +1,108 @@ +#ifndef INCLUDED_volk_16i_max_star_16i_a16_H +#define INCLUDED_volk_16i_max_star_16i_a16_H + + +#include +#include + + +#ifdef LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, unsigned int num_bytes) { + + + + short candidate = src0[0]; + short cands[8]; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + + + __m128i *p_src0; + + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + xmm1 = _mm_setzero_si128(); + xmm0 = _mm_setzero_si128(); + //_mm_insert_epi16(xmm0, candidate, 0); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm1); + + + for(i = 0; i < bound; ++i) { + xmm1 = _mm_load_si128(p_src0); + p_src0 += 1; + xmm2 = _mm_sub_epi16(xmm1, xmm0); + + + + + + + xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); + xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); + xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); + + xmm6 = _mm_xor_si128(xmm4, xmm5); + + xmm3 = _mm_and_si128(xmm3, xmm0); + xmm4 = _mm_and_si128(xmm6, xmm1); + + xmm0 = _mm_add_epi16(xmm3, xmm4); + + + } + + _mm_store_si128((__m128i*)cands, xmm0); + + for(i = 0; i < 8; ++i) { + candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; + } + + + + for(i = 0; i < leftovers; ++i) { + + candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; + } + + target[0] = candidate; + + + + + +} + +#endif /*LV_HAVE_SSSE3*/ + +#ifdef LV_HAVE_GENERIC + +static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short candidate = src0[0]; + for(i = 1; i < bound; ++i) { + candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; + } + target[0] = candidate; + +} + + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16i_max_star_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_max_star_16i_a16.h b/volk/include/volk/volk_16i_max_star_16i_a16.h deleted file mode 100644 index 063444279..000000000 --- a/volk/include/volk/volk_16i_max_star_16i_a16.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef INCLUDED_volk_16i_max_star_16i_a16_H -#define INCLUDED_volk_16i_max_star_16i_a16_H - - -#include -#include - - -#ifdef LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, unsigned int num_bytes) { - - - - short candidate = src0[0]; - short cands[8]; - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; - - - __m128i *p_src0; - - p_src0 = (__m128i*)src0; - - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 1) & 7; - - int i = 0; - - - xmm1 = _mm_setzero_si128(); - xmm0 = _mm_setzero_si128(); - //_mm_insert_epi16(xmm0, candidate, 0); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm1); - - - for(i = 0; i < bound; ++i) { - xmm1 = _mm_load_si128(p_src0); - p_src0 += 1; - xmm2 = _mm_sub_epi16(xmm1, xmm0); - - - - - - - xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); - xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); - xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); - - xmm6 = _mm_xor_si128(xmm4, xmm5); - - xmm3 = _mm_and_si128(xmm3, xmm0); - xmm4 = _mm_and_si128(xmm6, xmm1); - - xmm0 = _mm_add_epi16(xmm3, xmm4); - - - } - - _mm_store_si128((__m128i*)cands, xmm0); - - for(i = 0; i < 8; ++i) { - candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; - } - - - - for(i = 0; i < leftovers; ++i) { - - candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; - } - - target[0] = candidate; - - - - - -} - -#endif /*LV_HAVE_SSSE3*/ - -#ifdef LV_HAVE_GENERIC - -static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - short candidate = src0[0]; - for(i = 1; i < bound; ++i) { - candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; - } - target[0] = candidate; - -} - - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_16i_max_star_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h new file mode 100644 index 000000000..ece6adb40 --- /dev/null +++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h @@ -0,0 +1,130 @@ +#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a16_H +#define INCLUDED_volk_16i_max_star_horizontal_16i_a16_H + + +#include +#include + + +#ifdef LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; + const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; + + + + volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i xmm5, xmm6, xmm7, xmm8; + + xmm4 = _mm_load_si128((__m128i*)shufmask0); + xmm5 = _mm_load_si128((__m128i*)shufmask1); + xmm6 = _mm_load_si128((__m128i*)andmask0); + xmm7 = _mm_load_si128((__m128i*)andmask1); + + __m128i *p_target, *p_src0; + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 5; + int intermediate = (num_bytes >> 4) & 1; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + for(i = 0; i < bound; ++i) { + + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(&p_src0[1]); + + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 2; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + xmm3 = _mm_and_si128(xmm2, xmm7); + + + xmm8 = _mm_add_epi8(xmm8, xmm4); + xmm3 = _mm_add_epi8(xmm3, xmm5); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm8); + xmm1 = _mm_shuffle_epi8(xmm1, xmm3); + + + xmm3 = _mm_add_epi16(xmm0, xmm1); + + + _mm_store_si128(p_target, xmm3); + + p_target += 1; + + } + + for(i = 0; i < intermediate; ++i) { + + xmm0 = _mm_load_si128(p_src0); + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 1; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + + xmm3 = _mm_add_epi8(xmm8, xmm4); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm3); + + + _mm_storel_pd((double*)p_target, (__m128d)xmm0); + + p_target = (__m128i*)((int8_t*)p_target + 8); + + } + + for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { + target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; + } + + +} + +#endif /*LV_HAVE_SSSE3*/ + + +#ifdef LV_HAVE_GENERIC +static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + + for(i = 0; i < bound; i += 2) { + target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; + } + +} + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h deleted file mode 100644 index ece6adb40..000000000 --- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a16_H -#define INCLUDED_volk_16i_max_star_horizontal_16i_a16_H - - -#include -#include - - -#ifdef LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { - - const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; - const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; - const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; - - - - volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i xmm5, xmm6, xmm7, xmm8; - - xmm4 = _mm_load_si128((__m128i*)shufmask0); - xmm5 = _mm_load_si128((__m128i*)shufmask1); - xmm6 = _mm_load_si128((__m128i*)andmask0); - xmm7 = _mm_load_si128((__m128i*)andmask1); - - __m128i *p_target, *p_src0; - - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - - int bound = num_bytes >> 5; - int intermediate = (num_bytes >> 4) & 1; - int leftovers = (num_bytes >> 1) & 7; - - int i = 0; - - - for(i = 0; i < bound; ++i) { - - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(&p_src0[1]); - - - - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 2; - - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - - xmm8 = _mm_and_si128(xmm2, xmm6); - xmm3 = _mm_and_si128(xmm2, xmm7); - - - xmm8 = _mm_add_epi8(xmm8, xmm4); - xmm3 = _mm_add_epi8(xmm3, xmm5); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm8); - xmm1 = _mm_shuffle_epi8(xmm1, xmm3); - - - xmm3 = _mm_add_epi16(xmm0, xmm1); - - - _mm_store_si128(p_target, xmm3); - - p_target += 1; - - } - - for(i = 0; i < intermediate; ++i) { - - xmm0 = _mm_load_si128(p_src0); - - - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 1; - - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - - xmm8 = _mm_and_si128(xmm2, xmm6); - - xmm3 = _mm_add_epi8(xmm8, xmm4); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm3); - - - _mm_storel_pd((double*)p_target, (__m128d)xmm0); - - p_target = (__m128i*)((int8_t*)p_target + 8); - - } - - for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { - target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; - } - - -} - -#endif /*LV_HAVE_SSSE3*/ - - -#ifdef LV_HAVE_GENERIC -static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - - for(i = 0; i < bound; i += 2) { - target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; - } - -} - - - -#endif /*LV_HAVE_GENERIC*/ - -#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h new file mode 100644 index 000000000..ae1a18157 --- /dev/null +++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h @@ -0,0 +1,139 @@ +#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a16_H +#define INCLUDED_volk_16i_permute_and_scalar_add_a16_H + + +#include +#include + + + + +#ifdef LV_HAVE_SSE2 + +#include +#include + +static inline void volk_16i_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; + + short* p_permute_indexes = permute_indexes; + + p_target = (__m128i*)target; + p_cntl0 = (__m128i*)cntl0; + p_cntl1 = (__m128i*)cntl1; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + + for(; i < bound; ++i) { + xmm0 = _mm_setzero_si128(); + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = _mm_setzero_si128(); + + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_permute_indexes += 8; + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + xmm5 = _mm_load_si128(p_cntl0); + xmm6 = _mm_load_si128(p_cntl1); + xmm7 = _mm_load_si128(p_cntl2); + + xmm5 = _mm_and_si128(xmm5, xmm1); + xmm6 = _mm_and_si128(xmm6, xmm2); + xmm7 = _mm_and_si128(xmm7, xmm3); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + xmm5 = _mm_load_si128(p_cntl3); + + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_cntl0 += 1; + + xmm5 = _mm_and_si128(xmm5, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + p_cntl1 += 1; + p_cntl2 += 1; + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + p_cntl3 += 1; + + _mm_store_si128(p_target, xmm0); + + p_target += 1; + } + + + + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + } +} +#endif /*LV_HAVE_SSEs*/ + + +#ifdef LV_HAVE_GENERIC +static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a16_H*/ diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h deleted file mode 100644 index ae1a18157..000000000 --- a/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h +++ /dev/null @@ -1,139 +0,0 @@ -#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a16_H -#define INCLUDED_volk_16i_permute_and_scalar_add_a16_H - - -#include -#include - - - - -#ifdef LV_HAVE_SSE2 - -#include -#include - -static inline void volk_16i_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; - - short* p_permute_indexes = permute_indexes; - - p_target = (__m128i*)target; - p_cntl0 = (__m128i*)cntl0; - p_cntl1 = (__m128i*)cntl1; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - xmm0 = _mm_load_si128(p_scalars); - - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - - - for(; i < bound; ++i) { - xmm0 = _mm_setzero_si128(); - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = _mm_setzero_si128(); - - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); - - xmm0 = _mm_add_epi16(xmm0, xmm5); - xmm6 = _mm_add_epi16(xmm6, xmm7); - - p_permute_indexes += 8; - - xmm0 = _mm_add_epi16(xmm0, xmm6); - - xmm5 = _mm_load_si128(p_cntl0); - xmm6 = _mm_load_si128(p_cntl1); - xmm7 = _mm_load_si128(p_cntl2); - - xmm5 = _mm_and_si128(xmm5, xmm1); - xmm6 = _mm_and_si128(xmm6, xmm2); - xmm7 = _mm_and_si128(xmm7, xmm3); - - xmm0 = _mm_add_epi16(xmm0, xmm5); - - xmm5 = _mm_load_si128(p_cntl3); - - xmm6 = _mm_add_epi16(xmm6, xmm7); - - p_cntl0 += 1; - - xmm5 = _mm_and_si128(xmm5, xmm4); - - xmm0 = _mm_add_epi16(xmm0, xmm6); - - p_cntl1 += 1; - p_cntl2 += 1; - - xmm0 = _mm_add_epi16(xmm0, xmm5); - - p_cntl3 += 1; - - _mm_store_si128(p_target, xmm0); - - p_target += 1; - } - - - - - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - } -} -#endif /*LV_HAVE_SSEs*/ - - -#ifdef LV_HAVE_GENERIC -static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - for(i = 0; i < bound; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a16_H*/ diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a.h b/volk/include/volk/volk_16i_s32f_convert_32f_a.h new file mode 100644 index 000000000..09bc252f0 --- /dev/null +++ b/volk/include/volk/volk_16i_s32f_convert_32f_a.h @@ -0,0 +1,119 @@ +#ifndef INCLUDED_volk_16i_s32f_convert_32f_a16_H +#define INCLUDED_volk_16i_s32f_convert_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a16.h b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h deleted file mode 100644 index 09bc252f0..000000000 --- a/volk/include/volk/volk_16i_s32f_convert_32f_a16.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef INCLUDED_volk_16i_s32f_convert_32f_a16_H -#define INCLUDED_volk_16i_s32f_convert_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16i_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16i_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h new file mode 100644 index 000000000..94e5eb986 --- /dev/null +++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h @@ -0,0 +1,191 @@ +#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H +#define INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H + + +#include +#include + + + + + +#ifdef LV_HAVE_SSE2 + +#include + +static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + + + + int i = 0; + + int bound = (num_bytes >> 4); + int bound_copy = bound; + int leftovers = (num_bytes >> 1) & 7; + + __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; + p_target = (__m128i*) target; + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + + + + __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + while(bound_copy > 0) { + + xmm1 = _mm_load_si128(p_src0); + xmm2 = _mm_load_si128(p_src1); + xmm3 = _mm_load_si128(p_src2); + xmm4 = _mm_load_si128(p_src3); + + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = xmm1; + xmm8 = xmm3; + + + xmm1 = _mm_sub_epi16(xmm2, xmm1); + + + + xmm3 = _mm_sub_epi16(xmm4, xmm3); + + xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); + xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); + + + + xmm2 = _mm_and_si128(xmm5, xmm2); + xmm4 = _mm_and_si128(xmm6, xmm4); + xmm5 = _mm_andnot_si128(xmm5, xmm7); + xmm6 = _mm_andnot_si128(xmm6, xmm8); + + xmm5 = _mm_add_epi16(xmm2, xmm5); + xmm6 = _mm_add_epi16(xmm4, xmm6); + + + xmm1 = _mm_xor_si128(xmm1, xmm1); + xmm2 = xmm5; + xmm5 = _mm_sub_epi16(xmm6, xmm5); + p_src0 += 1; + bound_copy -= 1; + + xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); + p_src1 += 1; + + xmm6 = _mm_and_si128(xmm1, xmm6); + + xmm1 = _mm_andnot_si128(xmm1, xmm2); + p_src2 += 1; + + + + xmm1 = _mm_add_epi16(xmm6, xmm1); + p_src3 += 1; + + + _mm_store_si128(p_target, xmm1); + p_target += 1; + + } + + + /*asm volatile + ( + "volk_16i_x4_quad_max_star_16i_a16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je volk_16i_x4_quad_max_star_16i_a16_sse2_END\n\t" + + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "movaps %%xmm1, %%xmm7\n\t" + "movaps %%xmm3, %%xmm8\n\t" + "psubw %%xmm2, %%xmm1\n\t" + "psubw %%xmm4, %%xmm3\n\t" + + "pcmpgtw %%xmm1, %%xmm5\n\t" + "pcmpgtw %%xmm3, %%xmm6\n\t" + + "pand %%xmm5, %%xmm2\n\t" + "pand %%xmm6, %%xmm4\n\t" + "pandn %%xmm7, %%xmm5\n\t" + "pandn %%xmm8, %%xmm6\n\t" + + "paddw %%xmm2, %%xmm5\n\t" + "paddw %%xmm4, %%xmm6\n\t" + + "pxor %%xmm1, %%xmm1\n\t" + "movaps %%xmm5, %%xmm2\n\t" + + "psubw %%xmm6, %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $-1, %[bound]\n\t" + + "pcmpgtw %%xmm5, %%xmm1\n\t" + "add $16, %[src1]\n\t" + + "pand %%xmm1, %%xmm6\n\t" + + "pandn %%xmm2, %%xmm1\n\t" + "add $16, %[src2]\n\t" + + "paddw %%xmm6, %%xmm1\n\t" + "add $16, %[src3]\n\t" + + "movaps %%xmm1, (%[target])\n\t" + "addw $16, %[target]\n\t" + "jmp volk_16i_x4_quad_max_star_16i_a16_sse2_L1\n\t" + + "volk_16i_x4_quad_max_star_16i_a16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) + : + ); + */ + + short temp0 = 0; + short temp1 = 0; + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } + return; + + +} + +#endif /*LV_HAVE_SSE2*/ + + +#ifdef LV_HAVE_GENERIC +static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short temp0 = 0; + short temp1 = 0; + for(i = 0; i < bound; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } +} + + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h deleted file mode 100644 index 94e5eb986..000000000 --- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H -#define INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H - - -#include -#include - - - - - -#ifdef LV_HAVE_SSE2 - -#include - -static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - - - - int i = 0; - - int bound = (num_bytes >> 4); - int bound_copy = bound; - int leftovers = (num_bytes >> 1) & 7; - - __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; - p_target = (__m128i*) target; - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - - - - __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - - while(bound_copy > 0) { - - xmm1 = _mm_load_si128(p_src0); - xmm2 = _mm_load_si128(p_src1); - xmm3 = _mm_load_si128(p_src2); - xmm4 = _mm_load_si128(p_src3); - - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = xmm1; - xmm8 = xmm3; - - - xmm1 = _mm_sub_epi16(xmm2, xmm1); - - - - xmm3 = _mm_sub_epi16(xmm4, xmm3); - - xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); - xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); - - - - xmm2 = _mm_and_si128(xmm5, xmm2); - xmm4 = _mm_and_si128(xmm6, xmm4); - xmm5 = _mm_andnot_si128(xmm5, xmm7); - xmm6 = _mm_andnot_si128(xmm6, xmm8); - - xmm5 = _mm_add_epi16(xmm2, xmm5); - xmm6 = _mm_add_epi16(xmm4, xmm6); - - - xmm1 = _mm_xor_si128(xmm1, xmm1); - xmm2 = xmm5; - xmm5 = _mm_sub_epi16(xmm6, xmm5); - p_src0 += 1; - bound_copy -= 1; - - xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); - p_src1 += 1; - - xmm6 = _mm_and_si128(xmm1, xmm6); - - xmm1 = _mm_andnot_si128(xmm1, xmm2); - p_src2 += 1; - - - - xmm1 = _mm_add_epi16(xmm6, xmm1); - p_src3 += 1; - - - _mm_store_si128(p_target, xmm1); - p_target += 1; - - } - - - /*asm volatile - ( - "volk_16i_x4_quad_max_star_16i_a16_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je volk_16i_x4_quad_max_star_16i_a16_sse2_END\n\t" - - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "movaps %%xmm1, %%xmm7\n\t" - "movaps %%xmm3, %%xmm8\n\t" - "psubw %%xmm2, %%xmm1\n\t" - "psubw %%xmm4, %%xmm3\n\t" - - "pcmpgtw %%xmm1, %%xmm5\n\t" - "pcmpgtw %%xmm3, %%xmm6\n\t" - - "pand %%xmm5, %%xmm2\n\t" - "pand %%xmm6, %%xmm4\n\t" - "pandn %%xmm7, %%xmm5\n\t" - "pandn %%xmm8, %%xmm6\n\t" - - "paddw %%xmm2, %%xmm5\n\t" - "paddw %%xmm4, %%xmm6\n\t" - - "pxor %%xmm1, %%xmm1\n\t" - "movaps %%xmm5, %%xmm2\n\t" - - "psubw %%xmm6, %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $-1, %[bound]\n\t" - - "pcmpgtw %%xmm5, %%xmm1\n\t" - "add $16, %[src1]\n\t" - - "pand %%xmm1, %%xmm6\n\t" - - "pandn %%xmm2, %%xmm1\n\t" - "add $16, %[src2]\n\t" - - "paddw %%xmm6, %%xmm1\n\t" - "add $16, %[src3]\n\t" - - "movaps %%xmm1, (%[target])\n\t" - "addw $16, %[target]\n\t" - "jmp volk_16i_x4_quad_max_star_16i_a16_sse2_L1\n\t" - - "volk_16i_x4_quad_max_star_16i_a16_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) - : - ); - */ - - short temp0 = 0; - short temp1 = 0; - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } - return; - - -} - -#endif /*LV_HAVE_SSE2*/ - - -#ifdef LV_HAVE_GENERIC -static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - short temp0 = 0; - short temp1 = 0; - for(i = 0; i < bound; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } -} - - - - -#endif /*LV_HAVE_GENERIC*/ - -#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h new file mode 100644 index 000000000..c157bf64a --- /dev/null +++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h @@ -0,0 +1,136 @@ +#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H +#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H + + +#include +#include + + + + + +#ifdef LV_HAVE_SSE2 +#include +#include + +static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; + p_target0 = (__m128i*)target0; + p_target1 = (__m128i*)target1; + p_target2 = (__m128i*)target2; + p_target3 = (__m128i*)target3; + + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + p_src4 = (__m128i*)src4; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + for(; i < bound; ++i) { + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(p_src1); + xmm2 = _mm_load_si128(p_src2); + xmm3 = _mm_load_si128(p_src3); + xmm4 = _mm_load_si128(p_src4); + + p_src0 += 1; + p_src1 += 1; + + xmm1 = _mm_add_epi16(xmm0, xmm1); + xmm2 = _mm_add_epi16(xmm0, xmm2); + xmm3 = _mm_add_epi16(xmm0, xmm3); + xmm4 = _mm_add_epi16(xmm0, xmm4); + + + p_src2 += 1; + p_src3 += 1; + p_src4 += 1; + + _mm_store_si128(p_target0, xmm1); + _mm_store_si128(p_target1, xmm2); + _mm_store_si128(p_target2, xmm3); + _mm_store_si128(p_target3, xmm4); + + p_target0 += 1; + p_target1 += 1; + p_target2 += 1; + p_target3 += 1; + } + /*asm volatile + ( + ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END\n\t" + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + "movaps (%[src4]), %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $16, %[src1]\n\t" + "add $16, %[src2]\n\t" + "add $16, %[src3]\n\t" + "add $16, %[src4]\n\t" + "paddw %%xmm1, %%xmm2\n\t" + "paddw %%xmm1, %%xmm3\n\t" + "paddw %%xmm1, %%xmm4\n\t" + "paddw %%xmm1, %%xmm5\n\t" + "add $-1, %[bound]\n\t" + "movaps %%xmm2, (%[target0])\n\t" + "movaps %%xmm3, (%[target1])\n\t" + "movaps %%xmm4, (%[target2])\n\t" + "movaps %%xmm5, (%[target3])\n\t" + "add $16, %[target0]\n\t" + "add $16, %[target1]\n\t" + "add $16, %[target2]\n\t" + "add $16, %[target3]\n\t" + "jmp .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1\n\t" + ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) + :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + + */ + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} +#endif /*LV_HAVE_SSE2*/ + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + + + + + +#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H*/ diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h deleted file mode 100644 index c157bf64a..000000000 --- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H -#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H - - -#include -#include - - - - - -#ifdef LV_HAVE_SSE2 -#include -#include - -static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; - p_target0 = (__m128i*)target0; - p_target1 = (__m128i*)target1; - p_target2 = (__m128i*)target2; - p_target3 = (__m128i*)target3; - - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - p_src4 = (__m128i*)src4; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - for(; i < bound; ++i) { - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(p_src1); - xmm2 = _mm_load_si128(p_src2); - xmm3 = _mm_load_si128(p_src3); - xmm4 = _mm_load_si128(p_src4); - - p_src0 += 1; - p_src1 += 1; - - xmm1 = _mm_add_epi16(xmm0, xmm1); - xmm2 = _mm_add_epi16(xmm0, xmm2); - xmm3 = _mm_add_epi16(xmm0, xmm3); - xmm4 = _mm_add_epi16(xmm0, xmm4); - - - p_src2 += 1; - p_src3 += 1; - p_src4 += 1; - - _mm_store_si128(p_target0, xmm1); - _mm_store_si128(p_target1, xmm2); - _mm_store_si128(p_target2, xmm3); - _mm_store_si128(p_target3, xmm4); - - p_target0 += 1; - p_target1 += 1; - p_target2 += 1; - p_target3 += 1; - } - /*asm volatile - ( - ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END\n\t" - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - "movaps (%[src4]), %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $16, %[src1]\n\t" - "add $16, %[src2]\n\t" - "add $16, %[src3]\n\t" - "add $16, %[src4]\n\t" - "paddw %%xmm1, %%xmm2\n\t" - "paddw %%xmm1, %%xmm3\n\t" - "paddw %%xmm1, %%xmm4\n\t" - "paddw %%xmm1, %%xmm5\n\t" - "add $-1, %[bound]\n\t" - "movaps %%xmm2, (%[target0])\n\t" - "movaps %%xmm3, (%[target1])\n\t" - "movaps %%xmm4, (%[target2])\n\t" - "movaps %%xmm5, (%[target3])\n\t" - "add $16, %[target0]\n\t" - "add $16, %[target1]\n\t" - "add $16, %[target2]\n\t" - "add $16, %[target3]\n\t" - "jmp .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1\n\t" - ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) - :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - - */ - - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } -} -#endif /*LV_HAVE_SSE2*/ - - -#ifdef LV_HAVE_GENERIC - -static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - for(i = 0; i < bound; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } -} - -#endif /* LV_HAVE_GENERIC */ - - - - - -#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H*/ diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h new file mode 100644 index 000000000..227a92303 --- /dev/null +++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H +#define INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H + +#include +#include + +#ifdef LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); + __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); + qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = *int16ComplexVectorPtr++; + *qBufferPtr++ = *int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#ifdef LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); + + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16ic_deinterleave_16i_x2_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16ic_deinterleave_16i_x2_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_deinterleave_16i_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h deleted file mode 100644 index 227a92303..000000000 --- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H -#define INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H - -#include -#include - -#ifdef LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); - __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); - qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = *int16ComplexVectorPtr++; - *qBufferPtr++ = *int16ComplexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#ifdef LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); - - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -extern void volk_16ic_deinterleave_16i_x2_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16ic_deinterleave_16i_x2_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16ic_deinterleave_16i_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h new file mode 100644 index 000000000..35d0e8be2 --- /dev/null +++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a16_H +#define INCLUDED_volk_16ic_deinterleave_real_16i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + iOutputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + + +#ifdef LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i complexVal1, complexVal2, iOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h deleted file mode 100644 index 35d0e8be2..000000000 --- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a16_H -#define INCLUDED_volk_16ic_deinterleave_real_16i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - - iOutputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - - -#ifdef LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i complexVal1, complexVal2, iOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); - - iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h new file mode 100644 index 000000000..bdf5fc162 --- /dev/null +++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a16_H +#define INCLUDED_volk_16ic_deinterleave_real_8i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + complexVal1 = _mm_or_si128(complexVal1, complexVal2); + + complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); + complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); + + complexVal3 = _mm_or_si128(complexVal3, complexVal4); + + + complexVal1 = _mm_srai_epi16(complexVal1, 8); + complexVal3 = _mm_srai_epi16(complexVal3, 8); + + iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); + int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + int16_t* complexVectorPtr = (int16_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16ic_deinterleave_real_8i_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16ic_deinterleave_real_8i_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_deinterleave_real_8i_a16_orc_impl(iBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h deleted file mode 100644 index bdf5fc162..000000000 --- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a16_H -#define INCLUDED_volk_16ic_deinterleave_real_8i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - - complexVal1 = _mm_or_si128(complexVal1, complexVal2); - - complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); - complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); - - complexVal3 = _mm_or_si128(complexVal3, complexVal4); - - - complexVal1 = _mm_srai_epi16(complexVal1, 8); - complexVal3 = _mm_srai_epi16(complexVal3, 8); - - iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); - int16ComplexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - int16_t* complexVectorPtr = (int16_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -extern void volk_16ic_deinterleave_real_8i_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16ic_deinterleave_real_8i_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16ic_deinterleave_real_8i_a16_orc_impl(iBuffer, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a16_H */ diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a.h b/volk/include/volk/volk_16ic_magnitude_16i_a.h new file mode 100644 index 000000000..73c6f3390 --- /dev/null +++ b/volk/include/volk/volk_16ic_magnitude_16i_a.h @@ -0,0 +1,191 @@ +#ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H +#define INCLUDED_volk_16ic_magnitude_16i_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, result; + + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue1 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue2 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float scalar = 32768.0; + for(number = 0; number < num_points; number++){ + float real = ((float)(*complexVectorPtr++)) / scalar; + float imag = ((float)(*complexVectorPtr++)) / scalar; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC_DISABLED +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16ic_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); +static inline void volk_16ic_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_magnitude_16i_a16_H */ diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a16.h b/volk/include/volk/volk_16ic_magnitude_16i_a16.h deleted file mode 100644 index 73c6f3390..000000000 --- a/volk/include/volk/volk_16ic_magnitude_16i_a16.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H -#define INCLUDED_volk_16ic_magnitude_16i_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(32768.0); - __m128 invScalar = _mm_set_ps1(1.0/32768.0); - - __m128 cplxValue1, cplxValue2, result; - - __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - result = _mm_mul_ps(result, vScalar); // Scale the results - - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / 32768.0; - const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; - *magnitudeVectorPtr++ = (int16_t)(val1Result); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(32768.0); - __m128 invScalar = _mm_set_ps1(1.0/32768.0); - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - - __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - cplxValue1 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - cplxValue2 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - result = _mm_mul_ps(result, vScalar); // Scale the results - - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / 32768.0; - const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; - *magnitudeVectorPtr++ = (int16_t)(val1Result); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float scalar = 32768.0; - for(number = 0; number < num_points; number++){ - float real = ((float)(*complexVectorPtr++)) / scalar; - float imag = ((float)(*complexVectorPtr++)) / scalar; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC_DISABLED -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_16ic_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); -static inline void volk_16ic_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16ic_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16ic_magnitude_16i_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h new file mode 100644 index 000000000..e4a9015b4 --- /dev/null +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H +#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + uint64_t number = 0; + const uint64_t quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; + + for(;number < quarterPoints; number++){ + + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +extern void volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h deleted file mode 100644 index e4a9015b4..000000000 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H -#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE -#include - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - uint64_t number = 0; - const uint64_t quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; - - for(;number < quarterPoints; number++){ - - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); - - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -extern void volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h new file mode 100644 index 000000000..993445995 --- /dev/null +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H +#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + + for(;number < quarterPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi16_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; + sixteenTComplexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar = 1.0/scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h deleted file mode 100644 index 993445995..000000000 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H -#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - - for(;number < quarterPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm_cvtepi16_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; - sixteenTComplexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; - - const float iScalar = 1.0/scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; - - iValue = _mm_load_ps(floatBuffer); - - iValue = _mm_mul_ps(iValue, invScalar); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h new file mode 100644 index 000000000..a136c0535 --- /dev/null +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h @@ -0,0 +1,180 @@ +#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H +#define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + __m128 cplxValue1, cplxValue2, result; + + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) / scalar; + float val1Imag = (float)(*complexVectorPtr++) / scalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + + __m128 cplxValue1, cplxValue2, result, re, im; + + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; + + for(;number < quarterPoints; number++){ + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); + im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(re, invScalar); + cplxValue2 = _mm_mul_ps(im, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) * iScalar; + float val1Imag = (float)(*complexVectorPtr++) * iScalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} + + +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + float real = ( (float) (*complexVectorPtr++)) * invScalar; + float imag = ( (float) (*complexVectorPtr++)) * invScalar; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC_DISABLED +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16ic_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16ic_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16ic_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h deleted file mode 100644 index a136c0535..000000000 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h +++ /dev/null @@ -1,180 +0,0 @@ -#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H -#define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - __m128 cplxValue1, cplxValue2, result; - - __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) / scalar; - float val1Imag = (float)(*complexVectorPtr++) / scalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - - __m128 cplxValue1, cplxValue2, result, re, im; - - __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; - - for(;number < quarterPoints; number++){ - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); - im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(re, invScalar); - cplxValue2 = _mm_mul_ps(im, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) * iScalar; - float val1Imag = (float)(*complexVectorPtr++) * iScalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} - - -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - float real = ( (float) (*complexVectorPtr++)) * invScalar; - float imag = ( (float) (*complexVectorPtr++)) * invScalar; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC_DISABLED -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_16ic_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16ic_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16ic_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_16u_byteswap_a.h b/volk/include/volk/volk_16u_byteswap_a.h new file mode 100644 index 000000000..f393c05c5 --- /dev/null +++ b/volk/include/volk/volk_16u_byteswap_a.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_16u_byteswap_a16_H +#define INCLUDED_volk_16u_byteswap_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the 16t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + + // Byteswap any remaining points: + number = eighthPoints*8; + for(; number < num_points; number++){ + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for(point = 0; point < num_points; point++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +extern void volk_16u_byteswap_a16_orc_impl(uint16_t* intsToSwap, unsigned int num_points); +static inline void volk_16u_byteswap_a16_orc(uint16_t* intsToSwap, unsigned int num_points){ + volk_16u_byteswap_a16_orc_impl(intsToSwap, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_16u_byteswap_a16.h b/volk/include/volk/volk_16u_byteswap_a16.h deleted file mode 100644 index f393c05c5..000000000 --- a/volk/include/volk/volk_16u_byteswap_a16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_16u_byteswap_a16_H -#define INCLUDED_volk_16u_byteswap_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - -/*! - \brief Byteswaps (in-place) an aligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - uint16_t* inputPtr = intsToSwap; - __m128i input, left, right, output; - - const unsigned int eighthPoints = num_points / 8; - for(;number < eighthPoints; number++){ - // Load the 16t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - // Do the two shifts - left = _mm_slli_epi16(input, 8); - right = _mm_srli_epi16(input, 8); - // Or the left and right halves together - output = _mm_or_si128(left, right); - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 8; - } - - - // Byteswap any remaining points: - number = eighthPoints*8; - for(; number < num_points; number++){ - uint16_t outputVal = *inputPtr; - outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); - *inputPtr = outputVal; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int point; - uint16_t* inputPtr = intsToSwap; - for(point = 0; point < num_points; point++){ - uint16_t output = *inputPtr; - output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); - *inputPtr = output; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Byteswaps (in-place) an aligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -extern void volk_16u_byteswap_a16_orc_impl(uint16_t* intsToSwap, unsigned int num_points); -static inline void volk_16u_byteswap_a16_orc(uint16_t* intsToSwap, unsigned int num_points){ - volk_16u_byteswap_a16_orc_impl(intsToSwap, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a.h b/volk/include/volk/volk_32f_accumulator_s32f_a.h new file mode 100644 index 000000000..dd24a1e29 --- /dev/null +++ b/volk/include/volk/volk_32f_accumulator_s32f_a.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H +#define INCLUDED_volk_32f_accumulator_s32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; + + __m128 accumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + aVal = _mm_load_ps(aPtr); + accumulator = _mm_add_ps(accumulator, aVal); + aPtr += 4; + } + _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_32f_accumulator_s32f_a16_generic(float* result, const float* inputBuffer, unsigned int num_points){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + float returnValue = 0; + + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_accumulator_s32f_a16_H */ diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a16.h b/volk/include/volk/volk_32f_accumulator_s32f_a16.h deleted file mode 100644 index dd24a1e29..000000000 --- a/volk/include/volk/volk_32f_accumulator_s32f_a16.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H -#define INCLUDED_volk_32f_accumulator_s32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Accumulates the values in the input buffer - \param result The accumulated result - \param inputBuffer The buffer of data to be accumulated - \param num_points The number of values in inputBuffer to be accumulated -*/ -static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; - - __m128 accumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - accumulator = _mm_add_ps(accumulator, aVal); - aPtr += 4; - } - _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container - returnValue = tempBuffer[0]; - returnValue += tempBuffer[1]; - returnValue += tempBuffer[2]; - returnValue += tempBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Accumulates the values in the input buffer - \param result The accumulated result - \param inputBuffer The buffer of data to be accumulated - \param num_points The number of values in inputBuffer to be accumulated -*/ -static inline void volk_32f_accumulator_s32f_a16_generic(float* result, const float* inputBuffer, unsigned int num_points){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - float returnValue = 0; - - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_accumulator_s32f_a16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_a.h b/volk/include/volk/volk_32f_convert_64f_a.h new file mode 100644 index 000000000..8ca83220b --- /dev/null +++ b/volk/include/volk/volk_32f_convert_64f_a.h @@ -0,0 +1,70 @@ +#ifndef INCLUDED_volk_32f_convert_64f_a16_H +#define INCLUDED_volk_32f_convert_64f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_a16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_a16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_convert_64f_a16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_a16.h b/volk/include/volk/volk_32f_convert_64f_a16.h deleted file mode 100644 index 8ca83220b..000000000 --- a/volk/include/volk/volk_32f_convert_64f_a16.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INCLUDED_volk_32f_convert_64f_a16_H -#define INCLUDED_volk_32f_convert_64f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - /*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_32f_convert_64f_a16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; - - for(;number < quarterPoints; number++){ - inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - ret = _mm_cvtps_pd(inputVal); - - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - - inputVal = _mm_movehl_ps(inputVal, inputVal); - - ret = _mm_cvtps_pd(inputVal); - - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_32f_convert_64f_a16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_convert_64f_a16_H */ diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/include/volk/volk_32f_index_max_16u_a.h new file mode 100644 index 000000000..af1f35348 --- /dev/null +++ b/volk/include/volk/volk_32f_index_max_16u_a.h @@ -0,0 +1,149 @@ +#ifndef INCLUDED_volk_32f_index_max_16u_a16_H +#define INCLUDED_volk_32f_index_max_16u_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); + maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + +#ifdef LV_HAVE_SSE +#include + +static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); + + maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE*/ + +#ifdef LV_HAVE_GENERIC +static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + float max = src0[0]; + unsigned int index = 0; + + int i = 1; + + for(; i < num_points; ++i) { + + if(src0[i] > max){ + index = i; + max = src0[i]; + } + + } + target[0] = index; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32f_index_max_16u_a16_H*/ diff --git a/volk/include/volk/volk_32f_index_max_16u_a16.h b/volk/include/volk/volk_32f_index_max_16u_a16.h deleted file mode 100644 index af1f35348..000000000 --- a/volk/include/volk/volk_32f_index_max_16u_a16.h +++ /dev/null @@ -1,149 +0,0 @@ -#ifndef INCLUDED_volk_32f_index_max_16u_a16_H -#define INCLUDED_volk_32f_index_max_16u_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - -static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); - - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; - - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - - for(;number < quarterPoints; number++){ - - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - - compareResults = _mm_cmpgt_ps(maxValues, currentValues); - - maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); - } - - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; - } - } - target[0] = (unsigned int)index; - } -} - -#endif /*LV_HAVE_SSE4_1*/ - -#ifdef LV_HAVE_SSE -#include - -static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); - - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; - - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - - for(;number < quarterPoints; number++){ - - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - - compareResults = _mm_cmpgt_ps(maxValues, currentValues); - - maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); - - maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); - } - - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; - } - } - target[0] = (unsigned int)index; - } -} - -#endif /*LV_HAVE_SSE*/ - -#ifdef LV_HAVE_GENERIC -static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - float max = src0[0]; - unsigned int index = 0; - - int i = 1; - - for(; i < num_points; ++i) { - - if(src0[i] > max){ - index = i; - max = src0[i]; - } - - } - target[0] = index; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32f_index_max_16u_a16_H*/ diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h new file mode 100644 index 000000000..6efd21a37 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H +#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) + \param bound The interval that the input phase data is in, which is used to modulo the differentiation + \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. + \param num_noints The number of real values in the input vector. +*/ +static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ + if (num_points < 1) { + return; + } + unsigned int number = 1; + unsigned int j = 0; + // num_points-1 keeps Fedora 7's gcc from crashing... + // num_points won't work. :( + const unsigned int quarterPoints = (num_points-1) / 4; + + float* outPtr = outputVector; + const float* inPtr = inputVector; + __m128 upperBound = _mm_set_ps1(bound); + __m128 lowerBound = _mm_set_ps1(-bound); + __m128 next3old1; + __m128 next4; + __m128 boundAdjust; + __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. + __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. + // Do the first 4 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + for (; number < quarterPoints; number++) { + // Load data + next3old1 = _mm_loadu_ps((float*) (inPtr-1)); + next4 = _mm_load_ps(inPtr); + inPtr += 4; + // Subtract and store: + next3old1 = _mm_sub_ps(next4, next3old1); + // Bound: + boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); + boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); + next4 = _mm_cmplt_ps(next3old1, lowerBound); + next4 = _mm_and_ps(next4, negBoundAdjust); + boundAdjust = _mm_or_ps(next4, boundAdjust); + // Make sure we're in the bounding interval: + next3old1 = _mm_add_ps(next3old1, boundAdjust); + _mm_store_ps(outPtr,next3old1); // Store the results back into the output + outPtr += 4; + } + + for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points-1]; +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) + \param bound The interval that the input phase data is in, which is used to modulo the differentiation + \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. + \param num_points The number of real values in the input vector. +*/ +static inline void volk_32f_s32f_32f_fm_detect_32f_a16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ + if (num_points < 1) { + return; + } + unsigned int number = 0; + float* outPtr = outputVector; + const float* inPtr = inputVector; + + // Do the first 1 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + + for (number = 1; number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points-1]; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h deleted file mode 100644 index 6efd21a37..000000000 --- a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H -#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) - \param bound The interval that the input phase data is in, which is used to modulo the differentiation - \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. - \param num_noints The number of real values in the input vector. -*/ -static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ - if (num_points < 1) { - return; - } - unsigned int number = 1; - unsigned int j = 0; - // num_points-1 keeps Fedora 7's gcc from crashing... - // num_points won't work. :( - const unsigned int quarterPoints = (num_points-1) / 4; - - float* outPtr = outputVector; - const float* inPtr = inputVector; - __m128 upperBound = _mm_set_ps1(bound); - __m128 lowerBound = _mm_set_ps1(-bound); - __m128 next3old1; - __m128 next4; - __m128 boundAdjust; - __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. - __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. - // Do the first 4 by hand since we're going in from the saveValue: - *outPtr = *inPtr - *saveValue; - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - for (; number < quarterPoints; number++) { - // Load data - next3old1 = _mm_loadu_ps((float*) (inPtr-1)); - next4 = _mm_load_ps(inPtr); - inPtr += 4; - // Subtract and store: - next3old1 = _mm_sub_ps(next4, next3old1); - // Bound: - boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); - boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); - next4 = _mm_cmplt_ps(next3old1, lowerBound); - next4 = _mm_and_ps(next4, negBoundAdjust); - boundAdjust = _mm_or_ps(next4, boundAdjust); - // Make sure we're in the bounding interval: - next3old1 = _mm_add_ps(next3old1, boundAdjust); - _mm_store_ps(outPtr,next3old1); // Store the results back into the output - outPtr += 4; - } - - for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - *saveValue = inputVector[num_points-1]; -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) - \param bound The interval that the input phase data is in, which is used to modulo the differentiation - \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. - \param num_points The number of real values in the input vector. -*/ -static inline void volk_32f_s32f_32f_fm_detect_32f_a16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ - if (num_points < 1) { - return; - } - unsigned int number = 0; - float* outPtr = outputVector; - const float* inPtr = inputVector; - - // Do the first 1 by hand since we're going in from the saveValue: - *outPtr = *inPtr - *saveValue; - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - - for (number = 1; number < num_points; number++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - *saveValue = inputVector[num_points-1]; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h new file mode 100644 index 000000000..f5b388e6d --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h @@ -0,0 +1,168 @@ +#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H +#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* dataPointsPtr = realDataPoints; + __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; + + __m128 dataPointsVal; + __m128 avgPointsVal = _mm_setzero_ps(); + // Calculate the sum (for mean) for all points + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); + } + + _mm_store_ps(avgPointsVector, avgPointsVal); + + float sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more + const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; + + dataPointsPtr = realDataPoints; // Reset the dataPointsPtr + __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); + __m128 vOnesVector = _mm_set_ps1(1.0); + __m128 vValidBinCount = _mm_setzero_ps(); + avgPointsVal = _mm_setzero_ps(); + __m128 compareMask; + number = 0; + // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + // Identify which items do not exceed the mean amplitude + compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); + + // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude + avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); + + // Count the number of bins which do not exceed the mean amplitude + vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); + } + + // Calculate the mean from the remaining data points + _mm_store_ps(avgPointsVector, avgPointsVal); + + sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + // Calculate the number of valid bins from the remaning count + __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4]; + _mm_store_ps(validBinCountVector, vValidBinCount); + + float validBinCount = 0; + validBinCount += validBinCountVector[0]; + validBinCount += validBinCountVector[1]; + validBinCount += validBinCountVector[2]; + validBinCount += validBinCountVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(realDataPoints[number] <= meanAmplitude){ + sumMean += realDataPoints[number]; + validBinCount += 1.0; + } + } + + float localNoiseFloorAmplitude = 0; + if(validBinCount > 0.0){ + localNoiseFloorAmplitude = sumMean / validBinCount; + } + else{ + localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... + } + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + float sumMean = 0.0; + unsigned int number; + // find the sum (for mean), etc + for(number = 0; number < num_points; number++){ + // sum (for mean) + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more) + const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; + + // now throw out any bins higher than the mean + sumMean = 0.0; + unsigned int newNumDataPoints = num_points; + for(number = 0; number < num_points; number++){ + if (realDataPoints[number] <= meanAmplitude) + sumMean += realDataPoints[number]; + else + newNumDataPoints--; + } + + float localNoiseFloorAmplitude = 0.0; + if (newNumDataPoints == 0) // in the odd case that all + localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! + else + localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h deleted file mode 100644 index f5b388e6d..000000000 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h +++ /dev/null @@ -1,168 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H -#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Calculates the spectral noise floor of an input power spectrum - - Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. - - \param realDataPoints The input power spectrum - \param num_points The number of data points in the input power spectrum vector - \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 - \param noiseFloorAmplitude The noise floor of the input spectrum, in dB -*/ -static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* dataPointsPtr = realDataPoints; - __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; - - __m128 dataPointsVal; - __m128 avgPointsVal = _mm_setzero_ps(); - // Calculate the sum (for mean) for all points - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); - } - - _mm_store_ps(avgPointsVector, avgPointsVal); - - float sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more - const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; - - dataPointsPtr = realDataPoints; // Reset the dataPointsPtr - __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); - __m128 vOnesVector = _mm_set_ps1(1.0); - __m128 vValidBinCount = _mm_setzero_ps(); - avgPointsVal = _mm_setzero_ps(); - __m128 compareMask; - number = 0; - // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - // Identify which items do not exceed the mean amplitude - compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); - - // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude - avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); - - // Count the number of bins which do not exceed the mean amplitude - vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); - } - - // Calculate the mean from the remaining data points - _mm_store_ps(avgPointsVector, avgPointsVal); - - sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - // Calculate the number of valid bins from the remaning count - __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4]; - _mm_store_ps(validBinCountVector, vValidBinCount); - - float validBinCount = 0; - validBinCount += validBinCountVector[0]; - validBinCount += validBinCountVector[1]; - validBinCount += validBinCountVector[2]; - validBinCount += validBinCountVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(realDataPoints[number] <= meanAmplitude){ - sumMean += realDataPoints[number]; - validBinCount += 1.0; - } - } - - float localNoiseFloorAmplitude = 0; - if(validBinCount > 0.0){ - localNoiseFloorAmplitude = sumMean / validBinCount; - } - else{ - localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... - } - - *noiseFloorAmplitude = localNoiseFloorAmplitude; -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Calculates the spectral noise floor of an input power spectrum - - Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. - - \param realDataPoints The input power spectrum - \param num_points The number of data points in the input power spectrum vector - \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 - \param noiseFloorAmplitude The noise floor of the input spectrum, in dB -*/ -static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ - float sumMean = 0.0; - unsigned int number; - // find the sum (for mean), etc - for(number = 0; number < num_points; number++){ - // sum (for mean) - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more) - const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; - - // now throw out any bins higher than the mean - sumMean = 0.0; - unsigned int newNumDataPoints = num_points; - for(number = 0; number < num_points; number++){ - if (realDataPoints[number] <= meanAmplitude) - sumMean += realDataPoints[number]; - else - newNumDataPoints--; - } - - float localNoiseFloorAmplitude = 0.0; - if (newNumDataPoints == 0) // in the odd case that all - localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! - else - localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); - - *noiseFloorAmplitude = localNoiseFloorAmplitude; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h new file mode 100644 index 000000000..4acd2e13e --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -0,0 +1,111 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H +#define INCLUDED_volk_32f_s32f_convert_16i_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_16i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h deleted file mode 100644 index 4acd2e13e..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H -#define INCLUDED_volk_32f_s32f_convert_16i_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16i_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_16i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h new file mode 100644 index 000000000..3f5044313 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -0,0 +1,143 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H +#define INCLUDED_volk_32f_s32f_convert_32i_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_AVX +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1; + __m256i intInputVal1; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; + + intInputVal1 = _mm256_cvtps_epi32(_mm256_mul_ps(inputVal1, vScalar)); + + _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_32i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h deleted file mode 100644 index 3f5044313..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h +++ /dev/null @@ -1,143 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H -#define INCLUDED_volk_32f_s32f_convert_32i_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_AVX -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32i_a16_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m256 vScalar = _mm256_set1_ps(scalar); - __m256 inputVal1; - __m256i intInputVal1; - - for(;number < eighthPoints; number++){ - inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - - intInputVal1 = _mm256_cvtps_epi32(_mm256_mul_ps(inputVal1, vScalar)); - - _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_AVX */ - -#ifdef LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32i_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_32i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h new file mode 100644 index 000000000..c114ea38f --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h @@ -0,0 +1,118 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H +#define INCLUDED_volk_32f_s32f_convert_8i_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = (int8_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_8i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h deleted file mode 100644 index c114ea38f..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h +++ /dev/null @@ -1,118 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H -#define INCLUDED_volk_32f_s32f_convert_8i_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = (int8_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_8i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_normalize_a.h b/volk/include/volk/volk_32f_s32f_normalize_a.h new file mode 100644 index 000000000..e6195cd32 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_normalize_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_s32f_normalize_a16_H +#define INCLUDED_volk_32f_s32f_normalize_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) + \param vecBuffer The buffer of values to be vectorized + \param num_points The number of values in vecBuffer + \param scalar The scale value to be applied to each buffer value +*/ +static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + + const float invScalar = 1.0 / scalar; + __m128 vecScalar = _mm_set_ps1(invScalar); + + __m128 input1; + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + + input1 = _mm_load_ps(inputPtr); + + input1 = _mm_mul_ps(input1, vecScalar); + + _mm_store_ps(inputPtr, input1); + + inputPtr += 4; + } + + number = quarterPoints*4; + for(; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Normalizes the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be normalizeed + \param bVector One of the vectors to be normalizeed + \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector +*/ +static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Normalizes the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be normalizeed + \param bVector One of the vectors to be normalizeed + \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector +*/ +extern void volk_32f_s32f_normalize_a16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); +static inline void volk_32f_s32f_normalize_a16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ + float invscalar = 1.0 / scalar; + volk_32f_s32f_normalize_a16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_normalize_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_normalize_a16.h b/volk/include/volk/volk_32f_s32f_normalize_a16.h deleted file mode 100644 index e6195cd32..000000000 --- a/volk/include/volk/volk_32f_s32f_normalize_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_normalize_a16_H -#define INCLUDED_volk_32f_s32f_normalize_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) - \param vecBuffer The buffer of values to be vectorized - \param num_points The number of values in vecBuffer - \param scalar The scale value to be applied to each buffer value -*/ -static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* inputPtr = vecBuffer; - - const float invScalar = 1.0 / scalar; - __m128 vecScalar = _mm_set_ps1(invScalar); - - __m128 input1; - - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - - input1 = _mm_load_ps(inputPtr); - - input1 = _mm_mul_ps(input1, vecScalar); - - _mm_store_ps(inputPtr, input1); - - inputPtr += 4; - } - - number = quarterPoints*4; - for(; number < num_points; number++){ - *inputPtr *= invScalar; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Normalizes the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be normalizeed - \param bVector One of the vectors to be normalizeed - \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector -*/ -static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* inputPtr = vecBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *inputPtr *= invScalar; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Normalizes the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be normalizeed - \param bVector One of the vectors to be normalizeed - \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector -*/ -extern void volk_32f_s32f_normalize_a16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); -static inline void volk_32f_s32f_normalize_a16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ - float invscalar = 1.0 / scalar; - volk_32f_s32f_normalize_a16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_normalize_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/include/volk/volk_32f_s32f_power_32f_a.h new file mode 100644 index 000000000..ecff901e1 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_power_32f_a.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_volk_32f_s32f_power_32f_a16_H +#define INCLUDED_volk_32f_s32f_power_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_blendv_ps(aVal, negatedValues, signMask); + + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); + + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32f_s32f_power_32f_a16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_power_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a16.h b/volk/include/volk/volk_32f_s32f_power_32f_a16.h deleted file mode 100644 index ecff901e1..000000000 --- a/volk/include/volk/volk_32f_s32f_power_32f_a16.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_power_32f_a16_H -#define INCLUDED_volk_32f_s32f_power_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Takes each the input vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - __m128 zeroValue = _mm_setzero_ps(); - __m128 signMask; - __m128 negatedValues; - __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); - __m128 onesMask = _mm_set_ps1(1); - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - signMask = _mm_cmplt_ps(aVal, zeroValue); - negatedValues = _mm_sub_ps(zeroValue, aVal); - aVal = _mm_blendv_ps(aVal, negatedValues, signMask); - - // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after - cVal = powf4(aVal, vPower); // Takes each input value to the specified power - - cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - for(;number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Takes each the input vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - __m128 zeroValue = _mm_setzero_ps(); - __m128 signMask; - __m128 negatedValues; - __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); - __m128 onesMask = _mm_set_ps1(1); - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - signMask = _mm_cmplt_ps(aVal, zeroValue); - negatedValues = _mm_sub_ps(zeroValue, aVal); - aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); - - // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after - cVal = powf4(aVal, vPower); // Takes each input value to the specified power - - cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - for(;number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Takes each the input vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector - */ -static inline void volk_32f_s32f_power_32f_a16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_power_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h new file mode 100644 index 000000000..c2b903657 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h @@ -0,0 +1,145 @@ +#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H +#define INCLUDED_volk_32f_s32f_stddev_32f_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_s32f_stddev_32f_a16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h deleted file mode 100644 index c2b903657..000000000 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h +++ /dev/null @@ -1,145 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H -#define INCLUDED_volk_32f_s32f_stddev_32f_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! - \brief Calculates the standard deviation of the input buffer using the supplied mean - \param stddev The calculated standard deviation - \param inputBuffer The buffer of points to calculate the std deviation for - \param mean The mean of the input buffer - \param num_points The number of values in input buffer to used in the stddev calculation -*/ -static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ - float returnValue = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = inputBuffer; - - __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; - - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal1, aVal2, aVal3, aVal4; - __m128 cVal1, cVal2, cVal3, cVal4; - for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); - - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); - - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); - - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Calculates the standard deviation of the input buffer using the supplied mean - \param stddev The calculated standard deviation - \param inputBuffer The buffer of points to calculate the std deviation for - \param mean The mean of the input buffer - \param num_points The number of values in input buffer to used in the stddev calculation -*/ -static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ - float returnValue = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - - __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; - - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); // aVal = x - aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 - squareAccumulator = _mm_add_ps(squareAccumulator, aVal); - aPtr += 4; - } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Calculates the standard deviation of the input buffer using the supplied mean - \param stddev The calculated standard deviation - \param inputBuffer The buffer of points to calculate the std deviation for - \param mean The mean of the input buffer - \param num_points The number of values in input buffer to used in the stddev calculation -*/ -static inline void volk_32f_s32f_stddev_32f_a16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ - float returnValue = 0; - if(num_points > 0){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_sqrt_32f_a.h b/volk/include/volk/volk_32f_sqrt_32f_a.h new file mode 100644 index 000000000..a9ce76f88 --- /dev/null +++ b/volk/include/volk/volk_32f_sqrt_32f_a.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32f_sqrt_32f_a16_H +#define INCLUDED_volk_32f_sqrt_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_sqrt_ps(aVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +extern void volk_32f_sqrt_32f_a16_orc_impl(float *, const float*, unsigned int); +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_a16_orc(float* cVector, const float* aVector, unsigned int num_points){ + volk_32f_sqrt_32f_a16_orc_impl(cVector, aVector, num_points); +} + +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_volk_32f_sqrt_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_sqrt_32f_a16.h b/volk/include/volk/volk_32f_sqrt_32f_a16.h deleted file mode 100644 index a9ce76f88..000000000 --- a/volk/include/volk/volk_32f_sqrt_32f_a16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_32f_sqrt_32f_a16_H -#define INCLUDED_volk_32f_sqrt_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Sqrts the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be sqrted - \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector -*/ -static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - - cVal = _mm_sqrt_ps(aVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = sqrtf(*aPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Sqrts the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be sqrted - \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector -*/ -static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = sqrtf(*aPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -extern void volk_32f_sqrt_32f_a16_orc_impl(float *, const float*, unsigned int); -/*! - \brief Sqrts the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be sqrted - \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector -*/ -static inline void volk_32f_sqrt_32f_a16_orc(float* cVector, const float* aVector, unsigned int num_points){ - volk_32f_sqrt_32f_a16_orc_impl(cVector, aVector, num_points); -} - -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_volk_32f_sqrt_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h new file mode 100644 index 000000000..10d72e09d --- /dev/null +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h @@ -0,0 +1,170 @@ +#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H +#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_x2_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h deleted file mode 100644 index 10d72e09d..000000000 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H -#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal1, aVal2, aVal3, aVal4; - __m128 cVal1, cVal2, cVal3, cVal4; - for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); - accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x - - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); - accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x - - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); - accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x - - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); - accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); // aVal = x - accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x - aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 - squareAccumulator = _mm_add_ps(squareAccumulator, aVal); - aPtr += 4; - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_32f_x2_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_add_32f_a.h b/volk/include/volk/volk_32f_x2_add_32f_a.h new file mode 100644 index 000000000..2de6a6644 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_add_32f_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_x2_add_32f_a16_H +#define INCLUDED_volk_32f_x2_add_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +extern void volk_32f_x2_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_add_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_add_32f_a16.h b/volk/include/volk/volk_32f_x2_add_32f_a16.h deleted file mode 100644 index 2de6a6644..000000000 --- a/volk/include/volk/volk_32f_x2_add_32f_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_add_32f_a16_H -#define INCLUDED_volk_32f_x2_add_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -extern void volk_32f_x2_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_x2_add_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a.h b/volk/include/volk/volk_32f_x2_divide_32f_a.h new file mode 100644 index 000000000..1603e78de --- /dev/null +++ b/volk/include/volk/volk_32f_x2_divide_32f_a.h @@ -0,0 +1,82 @@ +#ifndef INCLUDED_volk_32f_x2_divide_32f_a16_H +#define INCLUDED_volk_32f_x2_divide_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_div_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +extern void volk_32f_x2_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_volk_32f_x2_divide_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a16.h b/volk/include/volk/volk_32f_x2_divide_32f_a16.h deleted file mode 100644 index 1603e78de..000000000 --- a/volk/include/volk/volk_32f_x2_divide_32f_a16.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_divide_32f_a16_H -#define INCLUDED_volk_32f_x2_divide_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_div_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -extern void volk_32f_x2_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_volk_32f_x2_divide_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h new file mode 100644 index 000000000..2cd974070 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -0,0 +1,185 @@ +#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H +#define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H + +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#ifdef LV_HAVE_SSE3 + +#include + +static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + + bVal1 = _mm_load_ps(bPtr); bPtr += 4; + bVal2 = _mm_load_ps(bPtr); bPtr += 4; + bVal3 = _mm_load_ps(bPtr); bPtr += 4; + bVal4 = _mm_load_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h deleted file mode 100644 index 2cd974070..000000000 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h +++ /dev/null @@ -1,185 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H -#define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H - -#include -#include - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#ifdef LV_HAVE_SSE - - -static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; - } - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#ifdef LV_HAVE_SSE3 - -#include - -static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); - - aPtr += 4; - bPtr += 4; - } - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_SSE4_1 - -#include - -static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - - bVal1 = _mm_load_ps(bPtr); bPtr += 4; - bVal2 = _mm_load_ps(bPtr); bPtr += 4; - bVal3 = _mm_load_ps(bPtr); bPtr += 4; - bVal4 = _mm_load_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h new file mode 100644 index 000000000..f3731fa2a --- /dev/null +++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a16_H +#define INCLUDED_volk_32f_x2_interleave_32fc_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Interleaves the I & Q vector data into the complex vector + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + unsigned int number = 0; + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + const uint64_t quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Interleaves the I & Q vector data into the complex vector. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_x2_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h deleted file mode 100644 index f3731fa2a..000000000 --- a/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a16_H -#define INCLUDED_volk_32f_x2_interleave_32fc_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Interleaves the I & Q vector data into the complex vector - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param num_points The number of complex data values to be interleaved -*/ -static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ - unsigned int number = 0; - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - const uint64_t quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Interleaves the I & Q vector data into the complex vector. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param num_points The number of complex data values to be interleaved -*/ -static inline void volk_32f_x2_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_max_32f_a.h b/volk/include/volk/volk_32f_x2_max_32f_a.h new file mode 100644 index 000000000..60be6e36d --- /dev/null +++ b/volk/include/volk/volk_32f_x2_max_32f_a.h @@ -0,0 +1,85 @@ +#ifndef INCLUDED_volk_32f_x2_max_32f_a16_H +#define INCLUDED_volk_32f_x2_max_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_max_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_x2_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_max_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_max_32f_a16.h b/volk/include/volk/volk_32f_x2_max_32f_a16.h deleted file mode 100644 index 60be6e36d..000000000 --- a/volk/include/volk/volk_32f_x2_max_32f_a16.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_max_32f_a16_H -#define INCLUDED_volk_32f_x2_max_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_max_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -extern void volk_32f_x2_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_x2_max_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_min_32f_a.h b/volk/include/volk/volk_32f_x2_min_32f_a.h new file mode 100644 index 000000000..3b8291531 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_min_32f_a.h @@ -0,0 +1,85 @@ +#ifndef INCLUDED_volk_32f_x2_min_32f_a16_H +#define INCLUDED_volk_32f_x2_min_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_min_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_x2_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_min_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_min_32f_a16.h b/volk/include/volk/volk_32f_x2_min_32f_a16.h deleted file mode 100644 index 3b8291531..000000000 --- a/volk/include/volk/volk_32f_x2_min_32f_a16.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_min_32f_a16_H -#define INCLUDED_volk_32f_x2_min_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_min_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -extern void volk_32f_x2_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_x2_min_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a.h b/volk/include/volk/volk_32f_x2_multiply_32f_a.h new file mode 100644 index 000000000..885941abf --- /dev/null +++ b/volk/include/volk/volk_32f_x2_multiply_32f_a.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_32f_x2_multiply_32f_a16_H +#define INCLUDED_volk_32f_x2_multiply_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +/*! + \brief Multiplies the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_a16_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +extern void volk_32f_x2_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_multiply_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h deleted file mode 100644 index 885941abf..000000000 --- a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_multiply_32f_a16_H -#define INCLUDED_volk_32f_x2_multiply_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_AVX -#include -/*! - \brief Multiplies the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_x2_multiply_32f_a16_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ - - aVal = _mm256_load_ps(aPtr); - bVal = _mm256_load_ps(bPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - - _mm256_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 8; - bPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_AVX */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -extern void volk_32f_x2_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_x2_multiply_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h new file mode 100644 index 000000000..f7ad3fd18 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h @@ -0,0 +1,156 @@ +#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H +#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue1, cplxValue2; + __m128i intValue1, intValue2; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); + + intValue1 = _mm_cvtps_epi32(cplxValue1); + intValue2 = _mm_cvtps_epi32(cplxValue2); + + intValue1 = _mm_packs_epi32(intValue1, intValue2); + + _mm_store_si128((__m128i*)complexVectorPtr, intValue1); + complexVectorPtr += 8; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_x2_s32f_interleave_16ic_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + int16_t* complexVectorPtr = (int16_t*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h deleted file mode 100644 index f7ad3fd18..000000000 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h +++ /dev/null @@ -1,156 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H -#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - const unsigned int quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue1, cplxValue2; - __m128i intValue1, intValue2; - - int16_t* complexVectorPtr = (int16_t*)complexVector; - - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); - cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); - cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); - - intValue1 = _mm_cvtps_epi32(cplxValue1); - intValue2 = _mm_cvtps_epi32(cplxValue2); - - intValue1 = _mm_packs_epi32(intValue1, intValue2); - - _mm_store_si128((__m128i*)complexVectorPtr, intValue1); - complexVectorPtr += 8; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } - -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - const unsigned int quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - - int16_t* complexVectorPtr = (int16_t*)complexVector; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); - - _mm_store_ps(floatBuffer, cplxValue); - - *complexVectorPtr++ = (int16_t)(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)(floatBuffer[3]); - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); - - _mm_store_ps(floatBuffer, cplxValue); - - *complexVectorPtr++ = (int16_t)(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)(floatBuffer[3]); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } - -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_x2_s32f_interleave_16ic_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - int16_t* complexVectorPtr = (int16_t*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a.h b/volk/include/volk/volk_32f_x2_subtract_32f_a.h new file mode 100644 index 000000000..c01f2c1f3 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_subtract_32f_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_x2_subtract_32f_a16_H +#define INCLUDED_volk_32f_x2_subtract_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_sub_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +extern void volk_32f_x2_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_subtract_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a16.h b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h deleted file mode 100644 index c01f2c1f3..000000000 --- a/volk/include/volk/volk_32f_x2_subtract_32f_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_subtract_32f_a16_H -#define INCLUDED_volk_32f_x2_subtract_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_sub_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -extern void volk_32f_x2_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_x2_subtract_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h new file mode 100644 index 000000000..6e446cbef --- /dev/null +++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h @@ -0,0 +1,151 @@ +#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H +#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H + +#include +#include +#include + +#ifndef MAX +#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +#endif + +#ifdef LV_HAVE_SSE3 +#include +#include + +static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; + + xmm9 = _mm_setzero_ps(); + xmm1 = _mm_setzero_ps(); + + xmm0 = _mm_load1_ps(¢er_point_array[0]); + xmm6 = _mm_load1_ps(¢er_point_array[1]); + xmm7 = _mm_load1_ps(¢er_point_array[2]); + xmm8 = _mm_load1_ps(¢er_point_array[3]); + //xmm11 = _mm_load1_ps(¢er_point_array[4]); + xmm10 = _mm_load1_ps(cutoff); + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 2) & 3; + int i = 0; + + for(; i < bound; ++i) { + xmm2 = _mm_load_ps(src0); + xmm2 = _mm_max_ps(xmm10, xmm2); + xmm3 = _mm_mul_ps(xmm2, xmm2); + xmm4 = _mm_mul_ps(xmm2, xmm3); + xmm5 = _mm_mul_ps(xmm3, xmm3); + //xmm12 = _mm_mul_ps(xmm3, xmm4); + + xmm2 = _mm_mul_ps(xmm2, xmm0); + xmm3 = _mm_mul_ps(xmm3, xmm6); + xmm4 = _mm_mul_ps(xmm4, xmm7); + xmm5 = _mm_mul_ps(xmm5, xmm8); + //xmm12 = _mm_mul_ps(xmm12, xmm11); + + xmm2 = _mm_add_ps(xmm2, xmm3); + xmm3 = _mm_add_ps(xmm4, xmm5); + + src0 += 4; + + xmm9 = _mm_add_ps(xmm2, xmm9); + + xmm1 = _mm_add_ps(xmm3, xmm1); + + //xmm9 = _mm_add_ps(xmm12, xmm9); + } + + xmm2 = _mm_hadd_ps(xmm9, xmm1); + xmm3 = _mm_hadd_ps(xmm2, xmm2); + xmm4 = _mm_hadd_ps(xmm3, xmm3); + + _mm_store_ss(&result, xmm4); + + + + for(i = 0; i < leftovers; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth);// + + //center_point_array[4] * fith); + } + + result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; + + target[0] = result; +} + + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_GENERIC + +static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + int i = 0; + + for(; i < num_bytes >> 2; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth); //+ + //center_point_array[4] * fith); + /*printf("%f12...%d\n", (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth) + + //center_point_array[4] * fith) + + (center_point_array[4]), i); + */ + } + + result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); + + + + *target = result; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h deleted file mode 100644 index 6e446cbef..000000000 --- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h +++ /dev/null @@ -1,151 +0,0 @@ -#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H -#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H - -#include -#include -#include - -#ifndef MAX -#define MAX(X,Y) ((X) > (Y)?(X):(Y)) -#endif - -#ifdef LV_HAVE_SSE3 -#include -#include - -static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - - float result = 0.0; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - //float fith = 0.0; - - - - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; - - xmm9 = _mm_setzero_ps(); - xmm1 = _mm_setzero_ps(); - - xmm0 = _mm_load1_ps(¢er_point_array[0]); - xmm6 = _mm_load1_ps(¢er_point_array[1]); - xmm7 = _mm_load1_ps(¢er_point_array[2]); - xmm8 = _mm_load1_ps(¢er_point_array[3]); - //xmm11 = _mm_load1_ps(¢er_point_array[4]); - xmm10 = _mm_load1_ps(cutoff); - - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 2) & 3; - int i = 0; - - for(; i < bound; ++i) { - xmm2 = _mm_load_ps(src0); - xmm2 = _mm_max_ps(xmm10, xmm2); - xmm3 = _mm_mul_ps(xmm2, xmm2); - xmm4 = _mm_mul_ps(xmm2, xmm3); - xmm5 = _mm_mul_ps(xmm3, xmm3); - //xmm12 = _mm_mul_ps(xmm3, xmm4); - - xmm2 = _mm_mul_ps(xmm2, xmm0); - xmm3 = _mm_mul_ps(xmm3, xmm6); - xmm4 = _mm_mul_ps(xmm4, xmm7); - xmm5 = _mm_mul_ps(xmm5, xmm8); - //xmm12 = _mm_mul_ps(xmm12, xmm11); - - xmm2 = _mm_add_ps(xmm2, xmm3); - xmm3 = _mm_add_ps(xmm4, xmm5); - - src0 += 4; - - xmm9 = _mm_add_ps(xmm2, xmm9); - - xmm1 = _mm_add_ps(xmm3, xmm1); - - //xmm9 = _mm_add_ps(xmm12, xmm9); - } - - xmm2 = _mm_hadd_ps(xmm9, xmm1); - xmm3 = _mm_hadd_ps(xmm2, xmm2); - xmm4 = _mm_hadd_ps(xmm3, xmm3); - - _mm_store_ss(&result, xmm4); - - - - for(i = 0; i < leftovers; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth);// + - //center_point_array[4] * fith); - } - - result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; - - target[0] = result; -} - - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_GENERIC - -static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - - - float result = 0.0; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - //float fith = 0.0; - - - - int i = 0; - - for(; i < num_bytes >> 2; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); //+ - //center_point_array[4] * fith); - /*printf("%f12...%d\n", (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth) + - //center_point_array[4] * fith) + - (center_point_array[4]), i); - */ - } - - result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); - - - - *target = result; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h new file mode 100644 index 000000000..846315a4a --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h @@ -0,0 +1,95 @@ +#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a16_H +#define INCLUDED_volk_32fc_32f_multiply_32fc_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Multiplies the input complex vector with the input float vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the float values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; + for(;number < quarterPoints; number++){ + + aVal1 = _mm_load_ps((const float*)aPtr); + aPtr += 2; + + aVal2 = _mm_load_ps((const float*)aPtr); + aPtr += 2; + + bVal = _mm_load_ps(bPtr); + bPtr += 4; + + bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); + bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); + + cVal = _mm_mul_ps(aVal1, bVal1); + + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + cPtr += 2; + + cVal = _mm_mul_ps(aVal2, bVal2); + + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr); + bPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_32f_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32fc_32f_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + volk_32fc_32f_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_GENERIC */ + + + +#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h deleted file mode 100644 index 846315a4a..000000000 --- a/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a16_H -#define INCLUDED_volk_32fc_32f_multiply_32fc_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include - /*! - \brief Multiplies the input complex vector with the input float vector and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The complex vector to be multiplied - \param bVector The vectors containing the float values to be multiplied against each complex value in aVector - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; - for(;number < quarterPoints; number++){ - - aVal1 = _mm_load_ps((const float*)aPtr); - aPtr += 2; - - aVal2 = _mm_load_ps((const float*)aPtr); - aPtr += 2; - - bVal = _mm_load_ps(bPtr); - bPtr += 4; - - bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); - bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); - - cVal = _mm_mul_ps(aVal1, bVal1); - - _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container - cPtr += 2; - - cVal = _mm_mul_ps(aVal2, bVal2); - - _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container - - cPtr += 2; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr); - bPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The complex vector to be multiplied - \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! - \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The complex vector to be multiplied - \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_32fc_32f_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32fc_32f_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - volk_32fc_32f_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_GENERIC */ - - - -#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h new file mode 100644 index 000000000..3e7c3fa28 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H +#define INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h deleted file mode 100644 index 3e7c3fa28..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H -#define INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h new file mode 100644 index 000000000..945a26742 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h @@ -0,0 +1,78 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H +#define INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h deleted file mode 100644 index 945a26742..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H -#define INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - - for(;number < halfPoints; number++){ - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(qBufferPtr, dVal); - - iBufferPtr += 2; - qBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_x2_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - *qBufferPtr++ = (double)*complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h new file mode 100644 index 000000000..3c3fb2583 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a16_H +#define INCLUDED_volk_32fc_deinterleave_real_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + float* iBufferPtr = iBuffer; + + __m128 cplxValue1, cplxValue2, iValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h deleted file mode 100644 index 3c3fb2583..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a16_H -#define INCLUDED_volk_32fc_deinterleave_real_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - float* iBufferPtr = iBuffer; - - __m128 cplxValue1, cplxValue2, iValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h new file mode 100644 index 000000000..40c1a7a46 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a16_H +#define INCLUDED_volk_32fc_deinterleave_real_64f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + iBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_64f_a16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h deleted file mode 100644 index 40c1a7a46..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a16_H -#define INCLUDED_volk_32fc_deinterleave_real_64f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - for(;number < halfPoints; number++){ - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - iBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_64f_a16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a16_H */ diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/include/volk/volk_32fc_index_max_16u_a.h new file mode 100644 index 000000000..0ad1edbe9 --- /dev/null +++ b/volk/include/volk/volk_32fc_index_max_16u_a.h @@ -0,0 +1,215 @@ +#ifndef INCLUDED_volk_32fc_index_max_16u_a16_H +#define INCLUDED_volk_32fc_index_max_16u_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +#include + + +static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { + + + + union bit128 holderf; + union bit128 holderi; + float sq_dist = 0.0; + + + + + union bit128 xmm5, xmm4; + __m128 xmm1, xmm2, xmm3; + __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; + + xmm5.int_vec = xmmfive = _mm_setzero_si128(); + xmm4.int_vec = xmmfour = _mm_setzero_si128(); + holderf.int_vec = holder0 = _mm_setzero_si128(); + holderi.int_vec = holder1 = _mm_setzero_si128(); + + + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! + xmm9 = xmm8 = _mm_setzero_si128(); + xmm10 = _mm_set_epi32(4, 4, 4, 4); + xmm3 = _mm_setzero_ps(); +; + + //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); + + for(; i < bound; ++i) { + + xmm1 = _mm_load_ps((float*)src0); + xmm2 = _mm_load_ps((float*)&src0[2]); + + + src0 += 4; + + + xmm1 = _mm_mul_ps(xmm1, xmm1); + xmm2 = _mm_mul_ps(xmm2, xmm2); + + + xmm1 = _mm_hadd_ps(xmm1, xmm2); + + xmm3 = _mm_max_ps(xmm1, xmm3); + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + + + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); + + } + + + for(i = 0; i < leftovers0; ++i) { + + + xmm2 = _mm_load_ps((float*)src0); + + xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8); + xmm8 = (__m128i)xmm1; + + xmm2 = _mm_mul_ps(xmm2, xmm2); + + src0 += 2; + + xmm1 = _mm_hadd_ps(xmm2, xmm2); + + xmm3 = _mm_max_ps(xmm1, xmm3); + + xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); + + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + } + + + + + for(i = 0; i < leftovers1; ++i) { + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + + sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); + + xmm2 = _mm_load1_ps(&sq_dist); + + xmm1 = xmm3; + + xmm3 = _mm_max_ss(xmm3, xmm2); + + + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + xmm8 = _mm_shuffle_epi32(xmm8, 0x00); + + xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); + + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + } + + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); + + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + _mm_store_ps((float*)&(holderf.f), xmm3); + _mm_store_si128(&(holderi.int_vec), xmm9); + + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; + + + + /* + float placeholder = 0.0; + uint32_t temp0, temp1; + unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); + unsigned int l0 = g0 ^ 1; + + unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); + unsigned int l1 = g1 ^ 1; + + temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; + temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; + sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; + placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; + + g0 = (sq_dist > placeholder); + l0 = g0 ^ 1; + target[0] = g0 * temp0 + l0 * temp1; + */ + +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_GENERIC +static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { + float sq_dist = 0.0; + float max = 0.0; + unsigned int index = 0; + + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + + sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); + + index = sq_dist > max ? i : index; + max = sq_dist > max ? sq_dist : max; + + + } + target[0] = index; + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_index_max_16u_a16_H*/ diff --git a/volk/include/volk/volk_32fc_index_max_16u_a16.h b/volk/include/volk/volk_32fc_index_max_16u_a16.h deleted file mode 100644 index 0ad1edbe9..000000000 --- a/volk/include/volk/volk_32fc_index_max_16u_a16.h +++ /dev/null @@ -1,215 +0,0 @@ -#ifndef INCLUDED_volk_32fc_index_max_16u_a16_H -#define INCLUDED_volk_32fc_index_max_16u_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -#include - - -static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { - - - - union bit128 holderf; - union bit128 holderi; - float sq_dist = 0.0; - - - - - union bit128 xmm5, xmm4; - __m128 xmm1, xmm2, xmm3; - __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; - - xmm5.int_vec = xmmfive = _mm_setzero_si128(); - xmm4.int_vec = xmmfour = _mm_setzero_si128(); - holderf.int_vec = holder0 = _mm_setzero_si128(); - holderi.int_vec = holder1 = _mm_setzero_si128(); - - - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - - xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! - xmm9 = xmm8 = _mm_setzero_si128(); - xmm10 = _mm_set_epi32(4, 4, 4, 4); - xmm3 = _mm_setzero_ps(); -; - - //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); - - for(; i < bound; ++i) { - - xmm1 = _mm_load_ps((float*)src0); - xmm2 = _mm_load_ps((float*)&src0[2]); - - - src0 += 4; - - - xmm1 = _mm_mul_ps(xmm1, xmm1); - xmm2 = _mm_mul_ps(xmm2, xmm2); - - - xmm1 = _mm_hadd_ps(xmm1, xmm2); - - xmm3 = _mm_max_ps(xmm1, xmm3); - - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - - xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - - xmm9 = _mm_add_epi32(xmm11, xmm12); - - xmm8 = _mm_add_epi32(xmm8, xmm10); - - - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); - - } - - - for(i = 0; i < leftovers0; ++i) { - - - xmm2 = _mm_load_ps((float*)src0); - - xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8); - xmm8 = (__m128i)xmm1; - - xmm2 = _mm_mul_ps(xmm2, xmm2); - - src0 += 2; - - xmm1 = _mm_hadd_ps(xmm2, xmm2); - - xmm3 = _mm_max_ps(xmm1, xmm3); - - xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); - - - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - - xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - - xmm9 = _mm_add_epi32(xmm11, xmm12); - - xmm8 = _mm_add_epi32(xmm8, xmm10); - //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - - } - - - - - for(i = 0; i < leftovers1; ++i) { - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - - - sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - - xmm2 = _mm_load1_ps(&sq_dist); - - xmm1 = xmm3; - - xmm3 = _mm_max_ss(xmm3, xmm2); - - - - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - xmm8 = _mm_shuffle_epi32(xmm8, 0x00); - - xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); - - - xmm9 = _mm_add_epi32(xmm11, xmm12); - - } - - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); - - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - - _mm_store_ps((float*)&(holderf.f), xmm3); - _mm_store_si128(&(holderi.int_vec), xmm9); - - target[0] = holderi.i[0]; - sq_dist = holderf.f[0]; - target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; - sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; - target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; - sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; - target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; - sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; - - - - /* - float placeholder = 0.0; - uint32_t temp0, temp1; - unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); - unsigned int l0 = g0 ^ 1; - - unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); - unsigned int l1 = g1 ^ 1; - - temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; - temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; - sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; - placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; - - g0 = (sq_dist > placeholder); - l0 = g0 ^ 1; - target[0] = g0 * temp0 + l0 * temp1; - */ - -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_GENERIC -static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { - float sq_dist = 0.0; - float max = 0.0; - unsigned int index = 0; - - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - - sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); - - index = sq_dist > max ? i : index; - max = sq_dist > max ? sq_dist : max; - - - } - target[0] = index; - -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32fc_index_max_16u_a16_H*/ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a.h b/volk/include/volk/volk_32fc_magnitude_32f_a.h new file mode 100644 index 000000000..946190e41 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_32f_a.h @@ -0,0 +1,132 @@ +#ifndef INCLUDED_volk_32fc_magnitude_32f_a16_H +#define INCLUDED_volk_32fc_magnitude_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +extern void volk_32fc_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); +static inline void volk_32fc_magnitude_32f_a16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + volk_32fc_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32fc_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a16.h b/volk/include/volk/volk_32fc_magnitude_32f_a16.h deleted file mode 100644 index 946190e41..000000000 --- a/volk/include/volk/volk_32fc_magnitude_32f_a16.h +++ /dev/null @@ -1,132 +0,0 @@ -#ifndef INCLUDED_volk_32fc_magnitude_32f_a16_H -#define INCLUDED_volk_32fc_magnitude_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -extern void volk_32fc_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); -static inline void volk_32fc_magnitude_32f_a16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - volk_32fc_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32fc_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h new file mode 100644 index 000000000..55b1b6c70 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a16_H +#define INCLUDED_volk_32fc_s32f_atan2_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float invNormalizeFactor = 1.0 / normalizeFactor; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 testVector = _mm_set_ps1(2*M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase,testVector); + phase = _mm_blendv_ps(correctVector, phase, keepMask); + // done with above correction. + phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; +#endif /* LV_HAVE_SIMDMATH_H */ + + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_SSE +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float invNormalizeFactor = 1.0 / normalizeFactor; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 testVector = _mm_set_ps1(2*M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 mask; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase,testVector); + phase = _mm_and_ps(phase, keepMask); + mask = _mm_andnot_ps(keepMask, correctVector); + phase = _mm_or_ps(phase, mask); + // done with above correction. + phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; +#endif /* LV_HAVE_SIMDMATH_H */ + + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The vector where the results will be stored. + \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_s32f_atan2_32f_a16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ + float* outPtr = outputVector; + const float* inPtr = (float*)inputVector; + const float invNormalizeFactor = 1.0 / normalizeFactor; + unsigned int number; + for ( number = 0; number < num_points; number++) { + const float real = *inPtr++; + const float imag = *inPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h deleted file mode 100644 index 55b1b6c70..000000000 --- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a16_H -#define INCLUDED_volk_32fc_s32f_atan2_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief performs the atan2 on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). - \param normalizeFactor The atan2 results will be divided by this normalization factor. - \param num_points The number of complex values in the input vector. -*/ -static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* outPtr = outputVector; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - const float invNormalizeFactor = 1.0 / normalizeFactor; - -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 testVector = _mm_set_ps1(2*M_PI); - __m128 correctVector = _mm_set_ps1(M_PI); - __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); - __m128 phase; - __m128 complex1, complex2, iValue, qValue; - __m128 keepMask; - - for (; number < quarterPoints; number++) { - // Load IQ data: - complex1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - complex2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); - // Arctan to get phase: - phase = atan2f4(qValue, iValue); - // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. - // Compare to 2pi: - keepMask = _mm_cmpneq_ps(phase,testVector); - phase = _mm_blendv_ps(correctVector, phase, keepMask); - // done with above correction. - phase = _mm_mul_ps(phase, vNormalizeFactor); - _mm_store_ps((float*)outPtr, phase); - outPtr += 4; - } - number = quarterPoints * 4; -#endif /* LV_HAVE_SIMDMATH_H */ - - for (; number < num_points; number++) { - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } -} -#endif /* LV_HAVE_SSE4_1 */ - - -#ifdef LV_HAVE_SSE -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief performs the atan2 on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). - \param normalizeFactor The atan2 results will be divided by this normalization factor. - \param num_points The number of complex values in the input vector. -*/ -static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* outPtr = outputVector; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - const float invNormalizeFactor = 1.0 / normalizeFactor; - -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 testVector = _mm_set_ps1(2*M_PI); - __m128 correctVector = _mm_set_ps1(M_PI); - __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); - __m128 phase; - __m128 complex1, complex2, iValue, qValue; - __m128 mask; - __m128 keepMask; - - for (; number < quarterPoints; number++) { - // Load IQ data: - complex1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - complex2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); - // Arctan to get phase: - phase = atan2f4(qValue, iValue); - // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. - // Compare to 2pi: - keepMask = _mm_cmpneq_ps(phase,testVector); - phase = _mm_and_ps(phase, keepMask); - mask = _mm_andnot_ps(keepMask, correctVector); - phase = _mm_or_ps(phase, mask); - // done with above correction. - phase = _mm_mul_ps(phase, vNormalizeFactor); - _mm_store_ps((float*)outPtr, phase); - outPtr += 4; - } - number = quarterPoints * 4; -#endif /* LV_HAVE_SIMDMATH_H */ - - for (; number < num_points; number++) { - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief performs the atan2 on the input vector and stores the results in the output vector. - \param outputVector The vector where the results will be stored. - \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin). - \param normalizeFactor The atan2 results will be divided by this normalization factor. - \param num_points The number of complex values in the input vector. -*/ -static inline void volk_32fc_s32f_atan2_32f_a16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ - float* outPtr = outputVector; - const float* inPtr = (float*)inputVector; - const float invNormalizeFactor = 1.0 / normalizeFactor; - unsigned int number; - for ( number = 0; number < num_points; number++) { - const float real = *inPtr++; - const float imag = *inPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h new file mode 100644 index 000000000..2460039d2 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H +#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + iValue = _mm_mul_ps(iValue, vScalar); + + _mm_store_ps(floatBuffer, iValue); + *iBufferPtr++ = (int16_t)(floatBuffer[0]); + *iBufferPtr++ = (int16_t)(floatBuffer[1]); + *iBufferPtr++ = (int16_t)(floatBuffer[2]); + *iBufferPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + iBufferPtr = &iBuffer[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_s32f_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h deleted file mode 100644 index 2460039d2..000000000 --- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H -#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - - iValue = _mm_mul_ps(iValue, vScalar); - - _mm_store_ps(floatBuffer, iValue); - *iBufferPtr++ = (int16_t)(floatBuffer[0]); - *iBufferPtr++ = (int16_t)(floatBuffer[1]); - *iBufferPtr++ = (int16_t)(floatBuffer[2]); - *iBufferPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - iBufferPtr = &iBuffer[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_s32f_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h new file mode 100644 index 000000000..f67ab0607 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h @@ -0,0 +1,159 @@ +#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H +#define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, result; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_32fc_s32f_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_32fc_s32f_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + volk_32fc_s32f_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h deleted file mode 100644 index f67ab0607..000000000 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h +++ /dev/null @@ -1,159 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H -#define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, result; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - result = _mm_mul_ps(result, vScalar); - - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - result = _mm_mul_ps(result, vScalar); - - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_32fc_s32f_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_32fc_s32f_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - volk_32fc_s32f_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h new file mode 100644 index 000000000..155b93ca2 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h @@ -0,0 +1,111 @@ +#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a16_H +#define INCLUDED_volk_32fc_s32f_power_32fc_a16_H + +#include +#include +#include + +//! raise a complex float to a real float power +static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a16(const lv_32fc_t exp, const float power){ + const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp)); + const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2); + return mag*lv_cmake(cosf(arg), sinf(arg)); +} + +#ifdef LV_HAVE_SSE +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + + __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + cplxValue2 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + // Convert to polar coordinates + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + phase = atan2f4(qValue, iValue); // Calculate the Phase + + magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values + + // Now calculate the power of the polar coordinate data + magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power + + phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power + + // Convert back to cartesian coordinates + iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude + qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude + + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values + + _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container + + cPtr += 2; + + _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h deleted file mode 100644 index 155b93ca2..000000000 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a16_H -#define INCLUDED_volk_32fc_s32f_power_32fc_a16_H - -#include -#include -#include - -//! raise a complex float to a real float power -static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a16(const lv_32fc_t exp, const float power){ - const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp)); - const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2); - return mag*lv_cmake(cosf(arg), sinf(arg)); -} - -#ifdef LV_HAVE_SSE -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Takes each the input complex vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The complex vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - - __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps((float*)aPtr); - aPtr += 2; - - cplxValue2 = _mm_load_ps((float*)aPtr); - aPtr += 2; - - // Convert to polar coordinates - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - phase = atan2f4(qValue, iValue); // Calculate the Phase - - magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values - - // Now calculate the power of the polar coordinate data - magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power - - phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power - - // Convert back to cartesian coordinates - iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude - qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude - - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values - - _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container - - cPtr += 2; - - _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container - - cPtr += 2; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - for(;number < num_points; number++){ - *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Takes each the input complex vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The complex vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector - */ -static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h new file mode 100644 index 000000000..03da069c2 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H +#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value for each input point + \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iNormalizationFactor = 1.0 / normalizationFactor; +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value for each input point + \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided agains all the input values before the power is calculated + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_power_spectrum_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + const float iNormalizationFactor = 1.0 / normalizationFactor; + unsigned int point; + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h deleted file mode 100644 index 03da069c2..000000000 --- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H -#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Calculates the log10 power value for each input point - \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param num_points The number of fft data points -*/ -static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iNormalizationFactor = 1.0 / normalizationFactor; -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 magScalar = _mm_set_ps1(10.0); - magScalar = _mm_div_ps(magScalar, logf4(magScalar)); - - __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); - - __m128 power; - __m128 input1, input2; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the complex values - input1 =_mm_load_ps(inputPtr); - inputPtr += 4; - input2 =_mm_load_ps(inputPtr); - inputPtr += 4; - - // Apply the normalization factor - input1 = _mm_mul_ps(input1, invNormalizationFactor); - input2 = _mm_mul_ps(input2, invNormalizationFactor); - - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm_mul_ps(input1, input1); - // (r3*r3), (i3*i3), (r4*r4), (i4*i4) - input2 = _mm_mul_ps(input2, input2); - - // Horizontal add, to add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - power = _mm_hadd_ps(input1, input2); - - // Calculate the natural log power - power = logf4(power); - - // Convert to log10 and multiply by 10.0 - power = _mm_mul_ps(power, magScalar); - - // Store the floating point results - _mm_store_ps(destPtr, power); - - destPtr += 4; - } - - number = quarterPoints*4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - - destPtr++; - } - -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Calculates the log10 power value for each input point - \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided agains all the input values before the power is calculated - \param num_points The number of fft data points -*/ -static inline void volk_32fc_s32f_power_spectrum_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ - // Calculate the Power of the complex point - const float* inputPtr = (float*)complexFFTInput; - float* realFFTDataPointsPtr = logPowerOutput; - const float iNormalizationFactor = 1.0 / normalizationFactor; - unsigned int point; - for(point = 0; point < num_points; point++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - - - realFFTDataPointsPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h new file mode 100644 index 000000000..5bcd7f7c4 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h @@ -0,0 +1,134 @@ +#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H +#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invRBW = _mm_set_ps1(iRBW); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Divide by the rbw + power = _mm_mul_ps(power, invRBW); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + unsigned int point; + const float invRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h deleted file mode 100644 index 5bcd7f7c4..000000000 --- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H -#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Calculates the log10 power value divided by the RBW for each input point - \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param rbw The resolution bandwith of the fft spectrum - \param num_points The number of fft data points -*/ -static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 magScalar = _mm_set_ps1(10.0); - magScalar = _mm_div_ps(magScalar, logf4(magScalar)); - - __m128 invRBW = _mm_set_ps1(iRBW); - - __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); - - __m128 power; - __m128 input1, input2; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the complex values - input1 =_mm_load_ps(inputPtr); - inputPtr += 4; - input2 =_mm_load_ps(inputPtr); - inputPtr += 4; - - // Apply the normalization factor - input1 = _mm_mul_ps(input1, invNormalizationFactor); - input2 = _mm_mul_ps(input2, invNormalizationFactor); - - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm_mul_ps(input1, input1); - // (r3*r3), (i3*i3), (r4*r4), (i4*i4) - input2 = _mm_mul_ps(input2, input2); - - // Horizontal add, to add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - power = _mm_hadd_ps(input1, input2); - - // Divide by the rbw - power = _mm_mul_ps(power, invRBW); - - // Calculate the natural log power - power = logf4(power); - - // Convert to log10 and multiply by 10.0 - power = _mm_mul_ps(power, magScalar); - - // Store the floating point results - _mm_store_ps(destPtr, power); - - destPtr += 4; - } - - number = quarterPoints*4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); - destPtr++; - } - -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Calculates the log10 power value divided by the RBW for each input point - \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param rbw The resolution bandwith of the fft spectrum - \param num_points The number of fft data points -*/ -static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ - // Calculate the Power of the complex point - const float* inputPtr = (float*)complexFFTInput; - float* realFFTDataPointsPtr = logPowerOutput; - unsigned int point; - const float invRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - - for(point = 0; point < num_points; point++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); - - realFFTDataPointsPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h new file mode 100644 index 000000000..f221237ff --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h @@ -0,0 +1,345 @@ +#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H +#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H + +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + + + + asm volatile + ( + "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %[conjugator], %%r9\n\t" + " movq %%rcx, %%rax\n\t" + " movaps 0(%%r9), %%xmm8\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movups 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " xorps %%xmm8, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " xorps %%xmm8, %%xmm3\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + " xorps %%xmm8, %%xmm2\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); + + } + + return; +} +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + int bound = num_bytes >> 4; + int leftovers = num_bytes % 16; + + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " #movl 12(%%ebp), %%eax # input\n\t" + " #movl 16(%%ebp), %%edx # taps\n\t" + " #movl 20(%%ebp), %%ecx # n_bytes\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%[eax]), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%[edx]), %%xmm2\n\t" + " movl %[ecx], (%[out])\n\t" + " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" + + " xorps %%xmm1, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%[eax]), %%xmmA\n\t" + "# movaps (%[edx]), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%[edx]), %%xmm3\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " xorps %%xmm1, %%xmm3\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " movaps 16(%[eax]), %%xmm1\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%[eax]), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %[eax]\n\t" + " movaps 32(%[edx]), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " xorps %%xmm1, %%xmm2\n\t" + " addl $32, %[edx]\n\t" + ".%=L1_test:\n\t" + " decl %[ecx]\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" + " shrl $4, %[ecx]\n\t" + " andl $1, %[ecx]\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " #movl 8(%%ebp), %[eax] \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%[out])\n\t" + " movss (%[out]), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %[eax] # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) + ); + + + + + printf("%d, %d\n", leftovers, bound); + + for(; leftovers > 0; leftovers -= 8) { + + + *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + + + +#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h deleted file mode 100644 index f221237ff..000000000 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h +++ /dev/null @@ -1,345 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H -#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H - -#include -#include -#include - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; - sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] + in[3] * tp[3]; - sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - - } - /* - for(i = 0; i < num_bytes >> 3; ++i) { - *result += input[i] * conjf(taps[i]); - } - */ -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - - - - asm volatile - ( - "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %[conjugator], %%r9\n\t" - " movq %%rcx, %%rax\n\t" - " movaps 0(%%r9), %%xmm8\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movups 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " xorps %%xmm8, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " xorps %%xmm8, %%xmm3\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - " xorps %%xmm8, %%xmm2\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) - :"rax", "r8", "r9", "r10" - ); - - - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); - - } - - return; -} -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - int bound = num_bytes >> 4; - int leftovers = num_bytes % 16; - - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " #movl 12(%%ebp), %%eax # input\n\t" - " #movl 16(%%ebp), %%edx # taps\n\t" - " #movl 20(%%ebp), %%ecx # n_bytes\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%[eax]), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%[edx]), %%xmm2\n\t" - " movl %[ecx], (%[out])\n\t" - " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" - - " xorps %%xmm1, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%[eax]), %%xmmA\n\t" - "# movaps (%[edx]), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%[edx]), %%xmm3\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " xorps %%xmm1, %%xmm3\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " movaps 16(%[eax]), %%xmm1\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%[eax]), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %[eax]\n\t" - " movaps 32(%[edx]), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " xorps %%xmm1, %%xmm2\n\t" - " addl $32, %[edx]\n\t" - ".%=L1_test:\n\t" - " decl %[ecx]\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" - " shrl $4, %[ecx]\n\t" - " andl $1, %[ecx]\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " #movl 8(%%ebp), %[eax] \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%[out])\n\t" - " movss (%[out]), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %[eax] # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) - ); - - - - - printf("%d, %d\n", leftovers, bound); - - for(; leftovers > 0; leftovers -= 8) { - - - *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); - - } - - return; - - - - - - -} - -#endif /*LV_HAVE_SSE*/ - - - -#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h new file mode 100644 index 000000000..9657c8f6b --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h @@ -0,0 +1,469 @@ +#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H +#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H + +#include +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; + + } + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + asm + ( + "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %%rcx, %%rax\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " movl 12(%%ebp), %%eax # input\n\t" + " movl 16(%%ebp), %%edx # taps\n\t" + " movl 20(%%ebp), %%ecx # n_bytes\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%eax), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%edx), %%xmm2\n\t" + " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%eax), %%xmmA\n\t" + "# movaps (%%edx), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%eax), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%edx), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%eax), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %%eax\n\t" + " movaps 32(%%edx), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " addl $32, %%edx\n\t" + ".%=L1_test:\n\t" + " decl %%ecx\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" + " shrl $4, %%ecx\n\t" + " andl $1, %%ecx\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " movl 8(%%ebp), %%eax \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%%eax)\n\t" + " movss (%%eax), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %%eax # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : + : "eax", "ecx", "edx" + ); + + + int getem = num_bytes % 16; + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + +#ifdef LV_HAVE_SSE3 + +#include + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_bytes >> 4; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + + _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if((num_bytes >> 2) != 0) { + dotProduct += (*a) * (*b); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_SSE4_1 + +#include + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + volk_32fc_x2_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes); + // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now + /* + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; + float *p_input, *p_taps; + __m64 *p_result; + + p_result = (__m64*)result; + p_input = (float*)input; + p_taps = (float*)taps; + + static const __m128i neg = {0x000000000000000080000000}; + + int i = 0; + + int bound = (num_bytes >> 5); + int leftovers = (num_bytes & 24) >> 3; + + real0 = _mm_sub_ps(real0, real0); + real1 = _mm_sub_ps(real1, real1); + im0 = _mm_sub_ps(im0, im0); + im1 = _mm_sub_ps(im1, im1); + + for(; i < bound; ++i) { + + + xmm0 = _mm_load_ps(p_input); + xmm1 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm2 = _mm_load_ps(p_input); + xmm3 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); + xmm5 = _mm_unpackhi_ps(xmm1, xmm3); + xmm0 = _mm_unpacklo_ps(xmm0, xmm2); + xmm2 = _mm_unpacklo_ps(xmm1, xmm3); + + //imaginary vector from input + xmm1 = _mm_unpackhi_ps(xmm0, xmm4); + //real vector from input + xmm3 = _mm_unpacklo_ps(xmm0, xmm4); + //imaginary vector from taps + xmm0 = _mm_unpackhi_ps(xmm2, xmm5); + //real vector from taps + xmm2 = _mm_unpacklo_ps(xmm2, xmm5); + + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); + xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); + + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); + xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); + + real0 = _mm_add_ps(xmm4, real0); + real1 = _mm_add_ps(xmm5, real1); + im0 = _mm_add_ps(xmm6, im0); + im1 = _mm_add_ps(xmm7, im1); + + } + + + + + real1 = _mm_xor_ps(real1, (__m128)neg); + + + im0 = _mm_add_ps(im0, im1); + real0 = _mm_add_ps(real0, real1); + + im0 = _mm_add_ps(im0, real0); + + _mm_storel_pi(p_result, im0); + + for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { + + *result += input[i] * taps[i]; + } + */ +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h deleted file mode 100644 index 9657c8f6b..000000000 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h +++ /dev/null @@ -1,469 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H -#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H - -#include -#include -#include -#include - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; - - } - -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - - asm - ( - "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %%rcx, %%rax\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) - :"rax", "r8", "r9", "r10" - ); - - - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - - } - - return; - -} - -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 - -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " movl 12(%%ebp), %%eax # input\n\t" - " movl 16(%%ebp), %%edx # taps\n\t" - " movl 20(%%ebp), %%ecx # n_bytes\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%eax), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%edx), %%xmm2\n\t" - " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%eax), %%xmmA\n\t" - "# movaps (%%edx), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%eax), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%edx), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%eax), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %%eax\n\t" - " movaps 32(%%edx), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " addl $32, %%edx\n\t" - ".%=L1_test:\n\t" - " decl %%ecx\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" - " shrl $4, %%ecx\n\t" - " andl $1, %%ecx\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " movl 8(%%ebp), %%eax \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%%eax)\n\t" - " movss (%%eax), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %%eax # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : - : "eax", "ecx", "edx" - ); - - - int getem = num_bytes % 16; - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - - } - - return; - - - - - - -} - -#endif /*LV_HAVE_SSE*/ - -#ifdef LV_HAVE_SSE3 - -#include - -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); - - unsigned int number = 0; - const unsigned int halfPoints = num_bytes >> 4; - - __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; - - dotProdVal = _mm_setzero_ps(); - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together - - a += 2; - b += 2; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; - - _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - - if((num_bytes >> 2) != 0) { - dotProduct += (*a) * (*b); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_SSE4_1 - -#include - -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - volk_32fc_x2_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes); - // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now - /* - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; - float *p_input, *p_taps; - __m64 *p_result; - - p_result = (__m64*)result; - p_input = (float*)input; - p_taps = (float*)taps; - - static const __m128i neg = {0x000000000000000080000000}; - - int i = 0; - - int bound = (num_bytes >> 5); - int leftovers = (num_bytes & 24) >> 3; - - real0 = _mm_sub_ps(real0, real0); - real1 = _mm_sub_ps(real1, real1); - im0 = _mm_sub_ps(im0, im0); - im1 = _mm_sub_ps(im1, im1); - - for(; i < bound; ++i) { - - - xmm0 = _mm_load_ps(p_input); - xmm1 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm2 = _mm_load_ps(p_input); - xmm3 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm4 = _mm_unpackhi_ps(xmm0, xmm2); - xmm5 = _mm_unpackhi_ps(xmm1, xmm3); - xmm0 = _mm_unpacklo_ps(xmm0, xmm2); - xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - - //imaginary vector from input - xmm1 = _mm_unpackhi_ps(xmm0, xmm4); - //real vector from input - xmm3 = _mm_unpacklo_ps(xmm0, xmm4); - //imaginary vector from taps - xmm0 = _mm_unpackhi_ps(xmm2, xmm5); - //real vector from taps - xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - - xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); - xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - - xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); - xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - - real0 = _mm_add_ps(xmm4, real0); - real1 = _mm_add_ps(xmm5, real1); - im0 = _mm_add_ps(xmm6, im0); - im1 = _mm_add_ps(xmm7, im1); - - } - - - - - real1 = _mm_xor_ps(real1, (__m128)neg); - - - im0 = _mm_add_ps(im0, im1); - real0 = _mm_add_ps(real0, real1); - - im0 = _mm_add_ps(im0, real0); - - _mm_storel_pi(p_result, im0); - - for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { - - *result += input[i] * taps[i]; - } - */ -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h new file mode 100644 index 000000000..72010b855 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a16_H +#define INCLUDED_volk_32fc_x2_multiply_32fc_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); +static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h deleted file mode 100644 index 72010b855..000000000 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a16_H -#define INCLUDED_volk_32fc_x2_multiply_32fc_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_store_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * (*b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); -static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - - - -#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h new file mode 100644 index 000000000..910f51679 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H +#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + lv_32fc_t diff; + memset(&diff, 0x0, 2*sizeof(float)); + + float sq_dist = 0.0; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm8 = _mm_load1_ps(&scalar); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_GENERIC +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h deleted file mode 100644 index 910f51679..000000000 --- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H -#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -#include - -static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - - - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - - lv_32fc_t diff; - memset(&diff, 0x0, 2*sizeof(float)); - - float sq_dist = 0.0; - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - - - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm8 = _mm_load1_ps(&scalar); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); - - - for(; i < bound - 1; ++i) { - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm3 = _mm_load_ps((float*)&points[2]); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_store_ps(target, xmm4); - - target += 4; - - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_store_ps(target, xmm4); - - target += 4; - - - for(i = 0; i < leftovers0; ++i) { - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); - - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); - - xmm4 = _mm_hadd_ps(xmm6, xmm6); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_storeh_pi((__m64*)target, xmm4); - - target += 2; - } - - for(i = 0; i < leftovers1; ++i) { - - diff = src0[0] - points[0]; - - sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - - target[0] = sq_dist; - } -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - lv_32fc_t diff; - float sq_dist; - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; - - sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - - target[i] = sq_dist; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h new file mode 100644 index 000000000..551f3cb53 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h @@ -0,0 +1,112 @@ +#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a16_H +#define INCLUDED_volk_32fc_x2_square_dist_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + lv_32fc_t diff; + float sq_dist; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + _mm_store_ps(target, xmm4); + + target += 4; + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_GENERIC +static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h deleted file mode 100644 index 551f3cb53..000000000 --- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a16_H -#define INCLUDED_volk_32fc_x2_square_dist_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -#include - -static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - - - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - lv_32fc_t diff; - float sq_dist; - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); - - - for(; i < bound - 1; ++i) { - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm3 = _mm_load_ps((float*)&points[2]); - - _mm_store_ps(target, xmm4); - - target += 4; - - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - _mm_store_ps(target, xmm4); - - target += 4; - - for(i = 0; i < leftovers0; ++i) { - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); - - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); - - xmm4 = _mm_hadd_ps(xmm6, xmm6); - - _mm_storeh_pi((__m64*)target, xmm4); - - target += 2; - } - - for(i = 0; i < leftovers1; ++i) { - - diff = src0[0] - points[0]; - - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[0] = sq_dist; - } -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - lv_32fc_t diff; - float sq_dist; - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; - - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[i] = sq_dist; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a16_H*/ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a.h b/volk/include/volk/volk_32i_s32f_convert_32f_a.h new file mode 100644 index 000000000..b744c7197 --- /dev/null +++ b/volk/include/volk/volk_32i_s32f_convert_32f_a.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_volk_32i_s32f_convert_32f_a16_H +#define INCLUDED_volk_32i_s32f_convert_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_load_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_store_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32i_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32i_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a16.h b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h deleted file mode 100644 index b744c7197..000000000 --- a/volk/include/volk/volk_32i_s32f_convert_32f_a16.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_volk_32i_s32f_convert_32f_a16_H -#define INCLUDED_volk_32i_s32f_convert_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_load_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_store_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32i_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32i_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_32i_x2_and_32i_a.h b/volk/include/volk/volk_32i_x2_and_32i_a.h new file mode 100644 index 000000000..4d50efd32 --- /dev/null +++ b/volk/include/volk/volk_32i_x2_and_32i_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32i_x2_and_32i_a16_H +#define INCLUDED_volk_32i_x2_and_32i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_and_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] & bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) & (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +extern void volk_32i_x2_and_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32i_x2_and_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32i_x2_and_32i_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32i_x2_and_32i_a16_H */ diff --git a/volk/include/volk/volk_32i_x2_and_32i_a16.h b/volk/include/volk/volk_32i_x2_and_32i_a16.h deleted file mode 100644 index 4d50efd32..000000000 --- a/volk/include/volk/volk_32i_x2_and_32i_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32i_x2_and_32i_a16_H -#define INCLUDED_volk_32i_x2_and_32i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_and_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] & bVector[number]; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) & (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -extern void volk_32i_x2_and_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32i_x2_and_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32i_x2_and_32i_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32i_x2_and_32i_a16_H */ diff --git a/volk/include/volk/volk_32i_x2_or_32i_a.h b/volk/include/volk/volk_32i_x2_or_32i_a.h new file mode 100644 index 000000000..9edbdbafd --- /dev/null +++ b/volk/include/volk/volk_32i_x2_or_32i_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32i_x2_or_32i_a16_H +#define INCLUDED_volk_32i_x2_or_32i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_or_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] | bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) | (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +extern void volk_32i_x2_or_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32i_x2_or_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32i_x2_or_32i_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32i_x2_or_32i_a16_H */ diff --git a/volk/include/volk/volk_32i_x2_or_32i_a16.h b/volk/include/volk/volk_32i_x2_or_32i_a16.h deleted file mode 100644 index 9edbdbafd..000000000 --- a/volk/include/volk/volk_32i_x2_or_32i_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32i_x2_or_32i_a16_H -#define INCLUDED_volk_32i_x2_or_32i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_or_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] | bVector[number]; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) | (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -extern void volk_32i_x2_or_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32i_x2_or_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32i_x2_or_32i_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32i_x2_or_32i_a16_H */ diff --git a/volk/include/volk/volk_32u_byteswap_a.h b/volk/include/volk/volk_32u_byteswap_a.h new file mode 100644 index 000000000..dc5cedab9 --- /dev/null +++ b/volk/include/volk/volk_32u_byteswap_a.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32u_byteswap_a16_H +#define INCLUDED_volk_32u_byteswap_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + + uint32_t* inputPtr = intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = quarterPoints*4; + for(; number < num_points; number++){ + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_a16_generic(uint32_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = intsToSwap; + + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_32u_byteswap_a16.h b/volk/include/volk/volk_32u_byteswap_a16.h deleted file mode 100644 index dc5cedab9..000000000 --- a/volk/include/volk/volk_32u_byteswap_a16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_32u_byteswap_a16_H -#define INCLUDED_volk_32u_byteswap_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - -/*! - \brief Byteswaps (in-place) an aligned vector of int32_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - - uint32_t* inputPtr = intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = quarterPoints*4; - for(; number < num_points; number++){ - uint32_t outputVal = *inputPtr; - outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); - *inputPtr = outputVal; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int32_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_32u_byteswap_a16_generic(uint32_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = intsToSwap; - - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output = *inputPtr; - output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); - - *inputPtr = output; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_32u_popcnt_a.h b/volk/include/volk/volk_32u_popcnt_a.h new file mode 100644 index 000000000..0d8b48fd5 --- /dev/null +++ b/volk/include/volk/volk_32u_popcnt_a.h @@ -0,0 +1,36 @@ +#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H +#define INCLUDED_VOLK_32u_POPCNT_A16_H + +#include +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t value) { + + // This is faster than a lookup table + uint32_t retVal = value; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + + *ret = retVal; +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_SSE4_2 + +#include + +static inline void volk_32u_popcnt_a16_sse4_2(uint32_t* ret, const uint32_t value) { + *ret = _mm_popcnt_u32(value); +} + +#endif /*LV_HAVE_SSE4_2*/ + +#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/ diff --git a/volk/include/volk/volk_32u_popcnt_a16.h b/volk/include/volk/volk_32u_popcnt_a16.h deleted file mode 100644 index 0d8b48fd5..000000000 --- a/volk/include/volk/volk_32u_popcnt_a16.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H -#define INCLUDED_VOLK_32u_POPCNT_A16_H - -#include -#include - - -#ifdef LV_HAVE_GENERIC - -static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t value) { - - // This is faster than a lookup table - uint32_t retVal = value; - - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - - *ret = retVal; -} - -#endif /*LV_HAVE_GENERIC*/ - -#ifdef LV_HAVE_SSE4_2 - -#include - -static inline void volk_32u_popcnt_a16_sse4_2(uint32_t* ret, const uint32_t value) { - *ret = _mm_popcnt_u32(value); -} - -#endif /*LV_HAVE_SSE4_2*/ - -#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/ diff --git a/volk/include/volk/volk_64f_convert_32f_a.h b/volk/include/volk/volk_64f_convert_32f_a.h new file mode 100644 index 000000000..cfcdbdc3a --- /dev/null +++ b/volk/include/volk/volk_64f_convert_32f_a.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_volk_64f_convert_32f_a16_H +#define INCLUDED_volk_64f_convert_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_a16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_a16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_a16.h b/volk/include/volk/volk_64f_convert_32f_a16.h deleted file mode 100644 index cfcdbdc3a..000000000 --- a/volk/include/volk/volk_64f_convert_32f_a16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_volk_64f_convert_32f_a16_H -#define INCLUDED_volk_64f_convert_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - /*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_64f_convert_32f_a16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); - - ret = _mm_movelh_ps(ret, ret2); - - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_64f_convert_32f_a16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_64f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_64f_x2_max_64f_a.h b/volk/include/volk/volk_64f_x2_max_64f_a.h new file mode 100644 index 000000000..21f488bf7 --- /dev/null +++ b/volk/include/volk/volk_64f_x2_max_64f_a.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_64f_x2_max_64f_a16_H +#define INCLUDED_volk_64f_x2_max_64f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_max_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_64f_x2_max_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_x2_max_64f_a16.h b/volk/include/volk/volk_64f_x2_max_64f_a16.h deleted file mode 100644 index 21f488bf7..000000000 --- a/volk/include/volk/volk_64f_x2_max_64f_a16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_volk_64f_x2_max_64f_a16_H -#define INCLUDED_volk_64f_x2_max_64f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); - - cVal = _mm_max_pd(aVal, bVal); - - _mm_store_pd(cPtr,cVal); // Store the results back into the C container - - aPtr += 2; - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_x2_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_64f_x2_max_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_x2_min_64f_a.h b/volk/include/volk/volk_64f_x2_min_64f_a.h new file mode 100644 index 000000000..8711a0eae --- /dev/null +++ b/volk/include/volk/volk_64f_x2_min_64f_a.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_64f_x2_min_64f_a16_H +#define INCLUDED_volk_64f_x2_min_64f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_min_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_64f_x2_min_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_x2_min_64f_a16.h b/volk/include/volk/volk_64f_x2_min_64f_a16.h deleted file mode 100644 index 8711a0eae..000000000 --- a/volk/include/volk/volk_64f_x2_min_64f_a16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_volk_64f_x2_min_64f_a16_H -#define INCLUDED_volk_64f_x2_min_64f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); - - cVal = _mm_min_pd(aVal, bVal); - - _mm_store_pd(cPtr,cVal); // Store the results back into the C container - - aPtr += 2; - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_x2_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_64f_x2_min_64f_a16_H */ diff --git a/volk/include/volk/volk_64u_byteswap_a.h b/volk/include/volk/volk_64u_byteswap_a.h new file mode 100644 index 000000000..b4bed8451 --- /dev/null +++ b/volk/include/volk/volk_64u_byteswap_a.h @@ -0,0 +1,88 @@ +#ifndef INCLUDED_volk_64u_byteswap_a16_H +#define INCLUDED_volk_64u_byteswap_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + uint64_t number = 0; + const unsigned int halfPoints = num_points / 2; + for(;number < halfPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + + // Reorder the two words + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = halfPoints*2; + for(; number < num_points; number++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_a16_generic(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_64u_byteswap_a16.h b/volk/include/volk/volk_64u_byteswap_a16.h deleted file mode 100644 index b4bed8451..000000000 --- a/volk/include/volk/volk_64u_byteswap_a16.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef INCLUDED_volk_64u_byteswap_a16_H -#define INCLUDED_volk_64u_byteswap_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - -/*! - \brief Byteswaps (in-place) an aligned vector of int64_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - uint64_t number = 0; - const unsigned int halfPoints = num_points / 2; - for(;number < halfPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - - // Reorder the two words - output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); - - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = halfPoints*2; - for(; number < num_points; number++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int64_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_64u_byteswap_a16_generic(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_64u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h new file mode 100644 index 000000000..8b92e91a1 --- /dev/null +++ b/volk/include/volk/volk_64u_popcnt_a.h @@ -0,0 +1,50 @@ +#ifndef INCLUDED_volk_64u_popcnt_a16_H +#define INCLUDED_volk_64u_popcnt_a16_H + +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t value) { + + const uint32_t* valueVector = (const uint32_t*)&value; + + // This is faster than a lookup table + uint32_t retVal = valueVector[0]; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + uint64_t retVal64 = retVal; + + retVal = valueVector[1]; + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + retVal64 += retVal; + + *ret = retVal64; + +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE4_2 && LV_HAVE_64 + +#include + +static inline void volk_64u_popcnt_a16_sse4_2(uint64_t* ret, const uint64_t value) { + *ret = _mm_popcnt_u64(value); + +} + +#endif /*LV_HAVE_SSE4_2*/ + +#endif /*INCLUDED_volk_64u_popcnt_a16_H*/ diff --git a/volk/include/volk/volk_64u_popcnt_a16.h b/volk/include/volk/volk_64u_popcnt_a16.h deleted file mode 100644 index 8b92e91a1..000000000 --- a/volk/include/volk/volk_64u_popcnt_a16.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef INCLUDED_volk_64u_popcnt_a16_H -#define INCLUDED_volk_64u_popcnt_a16_H - -#include -#include - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t value) { - - const uint32_t* valueVector = (const uint32_t*)&value; - - // This is faster than a lookup table - uint32_t retVal = valueVector[0]; - - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - uint64_t retVal64 = retVal; - - retVal = valueVector[1]; - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - retVal64 += retVal; - - *ret = retVal64; - -} - -#endif /*LV_HAVE_GENERIC*/ - -#if LV_HAVE_SSE4_2 && LV_HAVE_64 - -#include - -static inline void volk_64u_popcnt_a16_sse4_2(uint64_t* ret, const uint64_t value) { - *ret = _mm_popcnt_u64(value); - -} - -#endif /*LV_HAVE_SSE4_2*/ - -#endif /*INCLUDED_volk_64u_popcnt_a16_H*/ diff --git a/volk/include/volk/volk_8i_convert_16i_a.h b/volk/include/volk/volk_8i_convert_16i_a.h new file mode 100644 index 000000000..260ac40a1 --- /dev/null +++ b/volk/include/volk/volk_8i_convert_16i_a.h @@ -0,0 +1,83 @@ +#ifndef INCLUDED_volk_8i_convert_16i_a16_H +#define INCLUDED_volk_8i_convert_16i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +extern void volk_8i_convert_16i_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); +static inline void volk_8i_convert_16i_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + volk_8i_convert_16i_a16_orc_impl(outputVector, inputVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_convert_16i_a16.h b/volk/include/volk/volk_8i_convert_16i_a16.h deleted file mode 100644 index 260ac40a1..000000000 --- a/volk/include/volk/volk_8i_convert_16i_a16.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef INCLUDED_volk_8i_convert_16i_a16_H -#define INCLUDED_volk_8i_convert_16i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -extern void volk_8i_convert_16i_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); -static inline void volk_8i_convert_16i_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - volk_8i_convert_16i_a16_orc_impl(outputVector, inputVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a.h b/volk/include/volk/volk_8i_s32f_convert_32f_a.h new file mode 100644 index 000000000..9991b150e --- /dev/null +++ b/volk/include/volk/volk_8i_s32f_convert_32f_a.h @@ -0,0 +1,106 @@ +#ifndef INCLUDED_volk_8i_s32f_convert_32f_a16_H +#define INCLUDED_volk_8i_s32f_convert_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); +static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float invscalar = 1.0 / scalar; + volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, invscalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h deleted file mode 100644 index 9991b150e..000000000 --- a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef INCLUDED_volk_8i_s32f_convert_32f_a16_H -#define INCLUDED_volk_8i_s32f_convert_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); -static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float invscalar = 1.0 / scalar; - volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, invscalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h new file mode 100644 index 000000000..249acab49 --- /dev/null +++ b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H +#define INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + __m128i complexVal, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iOutputVal = _mm_cvtepi8_epi16(iOutputVal); + iOutputVal = _mm_slli_epi16(iOutputVal, 8); + + qOutputVal = _mm_cvtepi8_epi16(qOutputVal); + qOutputVal = _mm_slli_epi16(qOutputVal, 8); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h deleted file mode 100644 index 249acab49..000000000 --- a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H -#define INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - __m128i complexVal, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iOutputVal = _mm_cvtepi8_epi16(iOutputVal); - iOutputVal = _mm_slli_epi16(iOutputVal, 8); - - qOutputVal = _mm_cvtepi8_epi16(qOutputVal); - qOutputVal = _mm_slli_epi16(qOutputVal, 8); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h new file mode 100644 index 000000000..7b64b37c5 --- /dev/null +++ b/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a16_H +#define INCLUDED_volk_8ic_deinterleave_real_16i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i complexVal, outputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + outputVal = _mm_cvtepi8_epi16(complexVal); + outputVal = _mm_slli_epi16(outputVal, 7); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h deleted file mode 100644 index 7b64b37c5..000000000 --- a/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a16_H -#define INCLUDED_volk_8ic_deinterleave_real_16i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i complexVal, outputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - outputVal = _mm_cvtepi8_epi16(complexVal); - outputVal = _mm_slli_epi16(outputVal, 7); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE4_1 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h new file mode 100644 index 000000000..a1abad487 --- /dev/null +++ b/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H + +#include +#include + +#ifdef LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, outputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); + + outputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h deleted file mode 100644 index a1abad487..000000000 --- a/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H - -#include -#include - -#ifdef LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, outputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); - - outputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h new file mode 100644 index 000000000..7d778796e --- /dev/null +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h @@ -0,0 +1,165 @@ +#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H +#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue, qFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + iComplexVal = _mm_srli_si128(iComplexVal, 4); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 4; + + qComplexVal = _mm_srli_si128(qComplexVal, 4); + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + + qBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int8_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h deleted file mode 100644 index 7d778796e..000000000 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h +++ /dev/null @@ -1,165 +0,0 @@ -#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H -#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue, qFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - iComplexVal = _mm_srli_si128(iComplexVal, 4); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - qBufferPtr += 4; - - qComplexVal = _mm_srli_si128(qComplexVal, 4); - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - - qBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); - - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int8_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h new file mode 100644 index 000000000..a2e0cd8de --- /dev/null +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h @@ -0,0 +1,134 @@ +#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H +#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + + complexVal = _mm_srli_si128(complexVal, 4); + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h deleted file mode 100644 index a2e0cd8de..000000000 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H -#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - - complexVal = _mm_srli_si128(complexVal, 4); - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; - - iValue = _mm_load_ps(floatBuffer); - - iValue = _mm_mul_ps(iValue, invScalar); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h new file mode 100644 index 000000000..7307ae484 --- /dev/null +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h @@ -0,0 +1,101 @@ +#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H +#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + lv_16sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + int16_t* c16Ptr = (int16_t*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + int16_t* c16Ptr = (int16_t*)cVector; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number =0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H */ diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h deleted file mode 100644 index 7307ae484..000000000 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H -#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - lv_16sc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); - - a += 4; - b += 4; - c += 4; - } - - number = quarterPoints * 4; - int16_t* c16Ptr = (int16_t*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - int16_t* c16Ptr = (int16_t*)cVector; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number =0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H */ diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h new file mode 100644 index 000000000..adc7c0599 --- /dev/null +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H +#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + __m128 ret; + lv_32fc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + a += 4; + b += 4; + } + + number = quarterPoints * 4; + float* cFloatPtr = (float*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cFloatPtr++ = lv_creal(temp) / scalar; + *cFloatPtr++ = lv_cimag(temp) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* cPtr = (float*)cVector; + const float invScalar = 1.0 / scalar; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number = 0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cPtr++ = (lv_creal(temp) * invScalar); + *cPtr++ = (lv_cimag(temp) * invScalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H */ diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h deleted file mode 100644 index adc7c0599..000000000 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H -#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - __m128 ret; - lv_32fc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - a += 4; - b += 4; - } - - number = quarterPoints * 4; - float* cFloatPtr = (float*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cFloatPtr++ = lv_creal(temp) / scalar; - *cFloatPtr++ = lv_cimag(temp) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* cPtr = (float*)cVector; - const float invScalar = 1.0 / scalar; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number = 0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cPtr++ = (lv_creal(temp) * invScalar); - *cPtr++ = (lv_cimag(temp) * invScalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H */ -- cgit From 5f145a323af39c631e55e760e69a797126a54eb3 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 18 May 2011 13:20:50 -0700 Subject: Volk: rename functions _a instead of _a16 --- volk/include/volk/volk_16i_branch_4_state_8_a.h | 10 +++++----- volk/include/volk/volk_16i_convert_8i_a.h | 10 +++++----- volk/include/volk/volk_16i_max_star_16i_a.h | 10 +++++----- volk/include/volk/volk_16i_max_star_horizontal_16i_a.h | 10 +++++----- volk/include/volk/volk_16i_permute_and_scalar_add_a.h | 10 +++++----- volk/include/volk/volk_16i_s32f_convert_32f_a.h | 12 ++++++------ volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h | 18 +++++++++--------- volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h | 18 +++++++++--------- volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h | 18 +++++++++--------- volk/include/volk/volk_16ic_deinterleave_real_16i_a.h | 12 ++++++------ volk/include/volk/volk_16ic_deinterleave_real_8i_a.h | 16 ++++++++-------- volk/include/volk/volk_16ic_magnitude_16i_a.h | 18 +++++++++--------- .../volk/volk_16ic_s32f_deinterleave_32f_x2_a.h | 16 ++++++++-------- .../volk/volk_16ic_s32f_deinterleave_real_32f_a.h | 12 ++++++------ volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h | 18 +++++++++--------- volk/include/volk/volk_16u_byteswap_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_accumulator_s32f_a.h | 10 +++++----- volk/include/volk/volk_32f_convert_64f_a.h | 10 +++++----- volk/include/volk/volk_32f_index_max_16u_a.h | 12 ++++++------ volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h | 10 +++++----- .../volk_32f_s32f_calc_spectral_noise_floor_32f_a.h | 10 +++++----- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 12 ++++++------ volk/include/volk/volk_32f_s32f_convert_32i_a.h | 14 +++++++------- volk/include/volk/volk_32f_s32f_convert_8i_a.h | 12 ++++++------ volk/include/volk/volk_32f_s32f_normalize_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_s32f_power_32f_a.h | 12 ++++++------ volk/include/volk/volk_32f_s32f_stddev_32f_a.h | 12 ++++++------ volk/include/volk/volk_32f_sqrt_32f_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h | 12 ++++++------ volk/include/volk/volk_32f_x2_add_32f_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_x2_divide_32f_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 14 +++++++------- volk/include/volk/volk_32f_x2_interleave_32fc_a.h | 10 +++++----- volk/include/volk/volk_32f_x2_max_32f_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_x2_min_32f_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_x2_multiply_32f_a.h | 18 +++++++++--------- volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h | 12 ++++++------ volk/include/volk/volk_32f_x2_subtract_32f_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h | 10 +++++----- volk/include/volk/volk_32fc_32f_multiply_32fc_a.h | 16 ++++++++-------- volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h | 10 +++++----- volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h | 10 +++++----- volk/include/volk/volk_32fc_deinterleave_real_32f_a.h | 10 +++++----- volk/include/volk/volk_32fc_deinterleave_real_64f_a.h | 10 +++++----- volk/include/volk/volk_32fc_index_max_16u_a.h | 10 +++++----- volk/include/volk/volk_32fc_magnitude_32f_a.h | 18 +++++++++--------- volk/include/volk/volk_32fc_s32f_atan2_32f_a.h | 12 ++++++------ .../volk/volk_32fc_s32f_deinterleave_real_16i_a.h | 10 +++++----- volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h | 18 +++++++++--------- volk/include/volk/volk_32fc_s32f_power_32fc_a.h | 16 ++++++++-------- .../include/volk/volk_32fc_s32f_power_spectrum_32f_a.h | 10 +++++----- .../volk_32fc_s32f_x2_power_spectral_density_32f_a.h | 10 +++++----- .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h | 12 ++++++------ volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h | 18 +++++++++--------- volk/include/volk/volk_32fc_x2_multiply_32fc_a.h | 16 ++++++++-------- .../volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h | 10 +++++----- volk/include/volk/volk_32fc_x2_square_dist_32f_a.h | 10 +++++----- volk/include/volk/volk_32i_s32f_convert_32f_a.h | 10 +++++----- volk/include/volk/volk_32i_x2_and_32i_a.h | 16 ++++++++-------- volk/include/volk/volk_32i_x2_or_32i_a.h | 16 ++++++++-------- volk/include/volk/volk_32u_byteswap_a.h | 10 +++++----- volk/include/volk/volk_32u_popcnt_a.h | 4 ++-- volk/include/volk/volk_64f_convert_32f_a.h | 10 +++++----- volk/include/volk/volk_64f_x2_max_64f_a.h | 10 +++++----- volk/include/volk/volk_64f_x2_min_64f_a.h | 10 +++++----- volk/include/volk/volk_64u_byteswap_a.h | 10 +++++----- volk/include/volk/volk_64u_popcnt_a.h | 10 +++++----- volk/include/volk/volk_8i_convert_16i_a.h | 14 +++++++------- volk/include/volk/volk_8i_s32f_convert_32f_a.h | 14 +++++++------- volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h | 10 +++++----- volk/include/volk/volk_8ic_deinterleave_real_16i_a.h | 10 +++++----- volk/include/volk/volk_8ic_deinterleave_real_8i_a.h | 4 ++-- .../include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h | 12 ++++++------ .../volk/volk_8ic_s32f_deinterleave_real_32f_a.h | 12 ++++++------ .../volk/volk_8ic_x2_multiply_conjugate_16ic_a.h | 10 +++++----- .../volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h | 10 +++++----- 76 files changed, 477 insertions(+), 477 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a.h b/volk/include/volk/volk_16i_branch_4_state_8_a.h index 5eb03b346..0424e66e9 100644 --- a/volk/include/volk/volk_16i_branch_4_state_8_a.h +++ b/volk/include/volk/volk_16i_branch_4_state_8_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_branch_4_state_8_a16_H -#define INCLUDED_volk_16i_branch_4_state_8_a16_H +#ifndef INCLUDED_volk_16i_branch_4_state_8_a_H +#define INCLUDED_volk_16i_branch_4_state_8_a_H #include @@ -14,7 +14,7 @@ #include #include -static inline void volk_16i_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { +static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; @@ -138,7 +138,7 @@ static inline void volk_16i_branch_4_state_8_a16_ssse3(short* target, short* s #endif /*LV_HAVE_SSEs*/ #ifdef LV_HAVE_GENERIC -static inline void volk_16i_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { +static inline void volk_16i_branch_4_state_8_a_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { int i = 0; int bound = 4; @@ -191,4 +191,4 @@ static inline void volk_16i_branch_4_state_8_a16_generic(short* target, short* #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_16i_branch_4_state_8_a16_H*/ +#endif /*INCLUDED_volk_16i_branch_4_state_8_a_H*/ diff --git a/volk/include/volk/volk_16i_convert_8i_a.h b/volk/include/volk/volk_16i_convert_8i_a.h index 4d51e5903..8046035c7 100644 --- a/volk/include/volk/volk_16i_convert_8i_a.h +++ b/volk/include/volk/volk_16i_convert_8i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_convert_8i_a16_H -#define INCLUDED_volk_16i_convert_8i_a16_H +#ifndef INCLUDED_volk_16i_convert_8i_a_H +#define INCLUDED_volk_16i_convert_8i_a_H #include #include @@ -12,7 +12,7 @@ \param outputVector The 8 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_16i_convert_8i_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ +static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; @@ -52,7 +52,7 @@ static inline void volk_16i_convert_8i_a16_sse2(int8_t* outputVector, const int1 \param outputVector The 8 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_16i_convert_8i_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ +static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ int8_t* outputVectorPtr = outputVector; const int16_t* inputVectorPtr = inputVector; unsigned int number = 0; @@ -66,4 +66,4 @@ static inline void volk_16i_convert_8i_a16_generic(int8_t* outputVector, const i -#endif /* INCLUDED_volk_16i_convert_8i_a16_H */ +#endif /* INCLUDED_volk_16i_convert_8i_a_H */ diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/include/volk/volk_16i_max_star_16i_a.h index 063444279..6a4f63708 100644 --- a/volk/include/volk/volk_16i_max_star_16i_a.h +++ b/volk/include/volk/volk_16i_max_star_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_max_star_16i_a16_H -#define INCLUDED_volk_16i_max_star_16i_a16_H +#ifndef INCLUDED_volk_16i_max_star_16i_a_H +#define INCLUDED_volk_16i_max_star_16i_a_H #include @@ -12,7 +12,7 @@ #include #include -static inline void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, unsigned int num_bytes) { +static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_bytes) { @@ -87,7 +87,7 @@ static inline void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, #ifdef LV_HAVE_GENERIC -static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, unsigned int num_bytes) { +static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, unsigned int num_bytes) { int i = 0; @@ -105,4 +105,4 @@ static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_16i_max_star_16i_a16_H*/ +#endif /*INCLUDED_volk_16i_max_star_16i_a_H*/ diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h index ece6adb40..f60b33a41 100644 --- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h +++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a16_H -#define INCLUDED_volk_16i_max_star_horizontal_16i_a16_H +#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H +#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H #include @@ -12,7 +12,7 @@ #include #include -static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { +static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; @@ -110,7 +110,7 @@ static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, #ifdef LV_HAVE_GENERIC -static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { +static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { int i = 0; @@ -127,4 +127,4 @@ static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a16_H*/ +#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/ diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h index ae1a18157..de36cee80 100644 --- a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h +++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a16_H -#define INCLUDED_volk_16i_permute_and_scalar_add_a16_H +#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H +#define INCLUDED_volk_16i_permute_and_scalar_add_a_H #include @@ -13,7 +13,7 @@ #include #include -static inline void volk_16i_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { +static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; @@ -117,7 +117,7 @@ static inline void volk_16i_permute_and_scalar_add_a16_sse2(short* target, sho #ifdef LV_HAVE_GENERIC -static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { +static inline void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { int i = 0; @@ -136,4 +136,4 @@ static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, sh #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a16_H*/ +#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/ diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a.h b/volk/include/volk/volk_16i_s32f_convert_32f_a.h index 09bc252f0..0555fdf00 100644 --- a/volk/include/volk/volk_16i_s32f_convert_32f_a.h +++ b/volk/include/volk/volk_16i_s32f_convert_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_s32f_convert_32f_a16_H -#define INCLUDED_volk_16i_s32f_convert_32f_a16_H +#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H +#define INCLUDED_volk_16i_s32f_convert_32f_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; @@ -68,7 +68,7 @@ static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, con \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -102,7 +102,7 @@ static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_16i_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ float* outputVectorPtr = outputVector; const int16_t* inputVectorPtr = inputVector; unsigned int number = 0; @@ -116,4 +116,4 @@ static inline void volk_16i_s32f_convert_32f_a16_generic(float* outputVector, co -#endif /* INCLUDED_volk_16i_s32f_convert_32f_a16_H */ +#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h index 94e5eb986..2688aff04 100644 --- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h +++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H -#define INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H +#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H +#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H #include @@ -13,7 +13,7 @@ #include -static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { +static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { @@ -96,9 +96,9 @@ static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* /*asm volatile ( - "volk_16i_x4_quad_max_star_16i_a16_sse2_L1:\n\t" + "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t" "cmp $0, %[bound]\n\t" - "je volk_16i_x4_quad_max_star_16i_a16_sse2_END\n\t" + "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t" "movaps (%[src0]), %%xmm1\n\t" "movaps (%[src1]), %%xmm2\n\t" @@ -143,9 +143,9 @@ static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* "movaps %%xmm1, (%[target])\n\t" "addw $16, %[target]\n\t" - "jmp volk_16i_x4_quad_max_star_16i_a16_sse2_L1\n\t" + "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t" - "volk_16i_x4_quad_max_star_16i_a16_sse2_END:\n\t" + "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t" : :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) : @@ -168,7 +168,7 @@ static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* #ifdef LV_HAVE_GENERIC -static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { +static inline void volk_16i_x4_quad_max_star_16i_a_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { int i = 0; @@ -188,4 +188,4 @@ static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, shor #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H*/ +#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/ diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h index c157bf64a..e4c9f17ed 100644 --- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h +++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H -#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H +#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H +#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H #include @@ -13,7 +13,7 @@ #include #include -static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { +static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4; __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; @@ -65,9 +65,9 @@ static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* } /*asm volatile ( - ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1:\n\t" + ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t" "cmp $0, %[bound]\n\t" - "je .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END\n\t" + "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t" "movaps (%[src0]), %%xmm1\n\t" "movaps (%[src1]), %%xmm2\n\t" "movaps (%[src2]), %%xmm3\n\t" @@ -91,8 +91,8 @@ static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* "add $16, %[target1]\n\t" "add $16, %[target2]\n\t" "add $16, %[target3]\n\t" - "jmp .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1\n\t" - ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END:\n\t" + "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t" + ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t" : :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -113,7 +113,7 @@ static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* #ifdef LV_HAVE_GENERIC -static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { +static inline void volk_16i_x5_add_quad_16i_x4_a_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { int i = 0; @@ -133,4 +133,4 @@ static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short -#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H*/ +#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/ diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h index 227a92303..cdd60235e 100644 --- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h +++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H -#define INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H +#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H +#define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H #include #include @@ -13,7 +13,7 @@ \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -61,7 +61,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -128,7 +128,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int1 \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; int16_t* qBufferPtr = qBuffer; @@ -148,11 +148,11 @@ static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, i \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -extern void volk_16ic_deinterleave_16i_x2_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16ic_deinterleave_16i_x2_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16ic_deinterleave_16i_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); +extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16ic_deinterleave_16i_x2_a_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H */ +#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h index 35d0e8be2..2b99e068e 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a16_H -#define INCLUDED_volk_16ic_deinterleave_real_16i_a16_H +#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H +#define INCLUDED_volk_16ic_deinterleave_real_16i_a_H #include #include @@ -12,7 +12,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -55,7 +55,7 @@ static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, c \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -103,7 +103,7 @@ static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, co \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -117,4 +117,4 @@ static inline void volk_16ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, -#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a16_H */ +#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h index bdf5fc162..cd2fabb52 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a16_H -#define INCLUDED_volk_16ic_deinterleave_real_8i_a16_H +#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H +#define INCLUDED_volk_16ic_deinterleave_real_8i_a_H #include #include @@ -12,7 +12,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int8_t* iBufferPtr = iBuffer; @@ -66,7 +66,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; int16_t* complexVectorPtr = (int16_t*)complexVector; int8_t* iBufferPtr = iBuffer; @@ -84,11 +84,11 @@ static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, c \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -extern void volk_16ic_deinterleave_real_8i_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16ic_deinterleave_real_8i_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16ic_deinterleave_real_8i_a16_orc_impl(iBuffer, complexVector, num_points); +extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16ic_deinterleave_real_8i_a_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a16_H */ +#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */ diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a.h b/volk/include/volk/volk_16ic_magnitude_16i_a.h index 73c6f3390..a6951e967 100644 --- a/volk/include/volk/volk_16ic_magnitude_16i_a.h +++ b/volk/include/volk/volk_16ic_magnitude_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H -#define INCLUDED_volk_16ic_magnitude_16i_a16_H +#ifndef INCLUDED_volk_16ic_magnitude_16i_a_H +#define INCLUDED_volk_16ic_magnitude_16i_a_H #include #include @@ -14,7 +14,7 @@ \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -85,7 +85,7 @@ static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, co \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -161,7 +161,7 @@ static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, con \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; int16_t* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -181,11 +181,11 @@ static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -extern void volk_16ic_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); -static inline void volk_16ic_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16ic_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); +extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); +static inline void volk_16ic_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_16ic_magnitude_16i_a16_H */ +#endif /* INCLUDED_volk_16ic_magnitude_16i_a_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h index e4a9015b4..e73d405e0 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H -#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H +#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H +#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H #include #include @@ -15,7 +15,7 @@ \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -78,7 +78,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, fl \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -99,11 +99,11 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex data values to be deinterleaved */ -extern void volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); +extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16ic_s32f_deinterleave_32f_x2_a_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H */ +#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h index 993445995..1630cb0ed 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H -#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H +#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H +#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; unsigned int number = 0; @@ -62,7 +62,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffe \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; unsigned int number = 0; @@ -108,7 +108,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* iBufferPtr = iBuffer; @@ -123,4 +123,4 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a16_generic(float* iBuff -#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H */ +#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H */ diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h index a136c0535..35406e2cb 100644 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H -#define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H +#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H +#define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H #include #include @@ -15,7 +15,7 @@ \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -80,7 +80,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -149,7 +149,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_magnitude_32f_a_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -170,11 +170,11 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVect \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -extern void volk_16ic_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16ic_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16ic_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16ic_s32f_magnitude_32f_a_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H */ +#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a_H */ diff --git a/volk/include/volk/volk_16u_byteswap_a.h b/volk/include/volk/volk_16u_byteswap_a.h index f393c05c5..75c7ef0f3 100644 --- a/volk/include/volk/volk_16u_byteswap_a.h +++ b/volk/include/volk/volk_16u_byteswap_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16u_byteswap_a16_H -#define INCLUDED_volk_16u_byteswap_a16_H +#ifndef INCLUDED_volk_16u_byteswap_a_H +#define INCLUDED_volk_16u_byteswap_a_H #include #include @@ -12,7 +12,7 @@ \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){ unsigned int number = 0; uint16_t* inputPtr = intsToSwap; __m128i input, left, right, output; @@ -49,7 +49,7 @@ static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){ unsigned int point; uint16_t* inputPtr = intsToSwap; for(point = 0; point < num_points; point++){ @@ -67,11 +67,11 @@ static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -extern void volk_16u_byteswap_a16_orc_impl(uint16_t* intsToSwap, unsigned int num_points); -static inline void volk_16u_byteswap_a16_orc(uint16_t* intsToSwap, unsigned int num_points){ - volk_16u_byteswap_a16_orc_impl(intsToSwap, num_points); +extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points); +static inline void volk_16u_byteswap_a_orc(uint16_t* intsToSwap, unsigned int num_points){ + volk_16u_byteswap_a_orc_impl(intsToSwap, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_16u_byteswap_a16_H */ +#endif /* INCLUDED_volk_16u_byteswap_a_H */ diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a.h b/volk/include/volk/volk_32f_accumulator_s32f_a.h index dd24a1e29..7ce0d1c80 100644 --- a/volk/include/volk/volk_32f_accumulator_s32f_a.h +++ b/volk/include/volk/volk_32f_accumulator_s32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H -#define INCLUDED_volk_32f_accumulator_s32f_a16_H +#ifndef INCLUDED_volk_32f_accumulator_s32f_a_H +#define INCLUDED_volk_32f_accumulator_s32f_a_H #include #include @@ -13,7 +13,7 @@ \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated */ -static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){ float returnValue = 0; unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -50,7 +50,7 @@ static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated */ -static inline void volk_32f_accumulator_s32f_a16_generic(float* result, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_accumulator_s32f_a_generic(float* result, const float* inputBuffer, unsigned int num_points){ const float* aPtr = inputBuffer; unsigned int number = 0; float returnValue = 0; @@ -65,4 +65,4 @@ static inline void volk_32f_accumulator_s32f_a16_generic(float* result, const fl -#endif /* INCLUDED_volk_32f_accumulator_s32f_a16_H */ +#endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_a.h b/volk/include/volk/volk_32f_convert_64f_a.h index 8ca83220b..dda646409 100644 --- a/volk/include/volk/volk_32f_convert_64f_a.h +++ b/volk/include/volk/volk_32f_convert_64f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_convert_64f_a16_H -#define INCLUDED_volk_32f_convert_64f_a16_H +#ifndef INCLUDED_volk_32f_convert_64f_a_H +#define INCLUDED_volk_32f_convert_64f_a_H #include #include @@ -12,7 +12,7 @@ \param fVector The float vector values to be converted \param num_points The number of points in the two vectors to be converted */ -static inline void volk_32f_convert_64f_a16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ +static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -53,7 +53,7 @@ static inline void volk_32f_convert_64f_a16_sse2(double* outputVector, const flo \param fVector The float vector values to be converted \param num_points The number of points in the two vectors to be converted */ -static inline void volk_32f_convert_64f_a16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ +static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){ double* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; @@ -67,4 +67,4 @@ static inline void volk_32f_convert_64f_a16_generic(double* outputVector, const -#endif /* INCLUDED_volk_32f_convert_64f_a16_H */ +#endif /* INCLUDED_volk_32f_convert_64f_a_H */ diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/include/volk/volk_32f_index_max_16u_a.h index af1f35348..3e0cf1d65 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a.h +++ b/volk/include/volk/volk_32f_index_max_16u_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_index_max_16u_a16_H -#define INCLUDED_volk_32f_index_max_16u_a16_H +#ifndef INCLUDED_volk_32f_index_max_16u_a_H +#define INCLUDED_volk_32f_index_max_16u_a_H #include #include @@ -9,7 +9,7 @@ #ifdef LV_HAVE_SSE4_1 #include -static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { +static inline void volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { if(num_points > 0){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -67,7 +67,7 @@ static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const #ifdef LV_HAVE_SSE #include -static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const float* src0, unsigned int num_points) { +static inline void volk_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) { if(num_points > 0){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -124,7 +124,7 @@ static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const fl #endif /*LV_HAVE_SSE*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, const float* src0, unsigned int num_points) { +static inline void volk_32f_index_max_16u_a_generic(unsigned int* target, const float* src0, unsigned int num_points) { if(num_points > 0){ float max = src0[0]; unsigned int index = 0; @@ -146,4 +146,4 @@ static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, cons #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_32f_index_max_16u_a16_H*/ +#endif /*INCLUDED_volk_32f_index_max_16u_a_H*/ diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h index 6efd21a37..b25df75a1 100644 --- a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H -#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H +#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H +#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H #include #include @@ -14,7 +14,7 @@ \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. \param num_noints The number of real values in the input vector. */ -static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ +static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ if (num_points < 1) { return; } @@ -87,7 +87,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. \param num_points The number of real values in the input vector. */ -static inline void volk_32f_s32f_32f_fm_detect_32f_a16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ +static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ if (num_points < 1) { return; } @@ -117,4 +117,4 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a16_generic(float* outputVect -#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h index f5b388e6d..b1902a8c0 100644 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H -#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H +#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H +#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H #include #include @@ -17,7 +17,7 @@ \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 \param noiseFloorAmplitude The noise floor of the input spectrum, in dB */ -static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -128,7 +128,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* no \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 \param noiseFloorAmplitude The noise floor of the input spectrum, in dB */ -static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ float sumMean = 0.0; unsigned int number; // find the sum (for mean), etc @@ -165,4 +165,4 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_generic(float -#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index 4acd2e13e..0a2b4f0f2 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H -#define INCLUDED_volk_32f_s32f_convert_16i_a16_H +#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H +#define INCLUDED_volk_32f_s32f_convert_16i_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; @@ -54,7 +54,7 @@ static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, con \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -94,7 +94,7 @@ static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, cons \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_16i_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ int16_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; @@ -108,4 +108,4 @@ static inline void volk_32f_s32f_convert_16i_a16_generic(int16_t* outputVector, -#endif /* INCLUDED_volk_32f_s32f_convert_16i_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h index 3f5044313..aa370e614 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H -#define INCLUDED_volk_32f_s32f_convert_32i_a16_H +#ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H +#define INCLUDED_volk_32f_s32f_convert_32i_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_32i_a16_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; @@ -50,7 +50,7 @@ static inline void volk_32f_s32f_convert_32i_a16_avx(int32_t* outputVector, cons \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -86,7 +86,7 @@ static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, con \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -126,7 +126,7 @@ static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, cons \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_32i_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ int32_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; @@ -140,4 +140,4 @@ static inline void volk_32f_s32f_convert_32i_a16_generic(int32_t* outputVector, -#endif /* INCLUDED_volk_32f_s32f_convert_32i_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h index c114ea38f..8d87a07d7 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H -#define INCLUDED_volk_32f_s32f_convert_8i_a16_H +#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H +#define INCLUDED_volk_32f_s32f_convert_8i_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; @@ -61,7 +61,7 @@ static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -101,7 +101,7 @@ static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ int8_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; @@ -115,4 +115,4 @@ static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, co -#endif /* INCLUDED_volk_32f_s32f_convert_8i_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_normalize_a.h b/volk/include/volk/volk_32f_s32f_normalize_a.h index e6195cd32..f5fd0d1db 100644 --- a/volk/include/volk/volk_32f_s32f_normalize_a.h +++ b/volk/include/volk/volk_32f_s32f_normalize_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_normalize_a16_H -#define INCLUDED_volk_32f_s32f_normalize_a16_H +#ifndef INCLUDED_volk_32f_s32f_normalize_a_H +#define INCLUDED_volk_32f_s32f_normalize_a_H #include #include @@ -12,7 +12,7 @@ \param num_points The number of values in vecBuffer \param scalar The scale value to be applied to each buffer value */ -static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float scalar, unsigned int num_points){ unsigned int number = 0; float* inputPtr = vecBuffer; @@ -49,7 +49,7 @@ static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float \param bVector One of the vectors to be normalizeed \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector */ -static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_normalize_a_generic(float* vecBuffer, const float scalar, unsigned int num_points){ unsigned int number = 0; float* inputPtr = vecBuffer; const float invScalar = 1.0 / scalar; @@ -68,14 +68,14 @@ static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const f \param bVector One of the vectors to be normalizeed \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector */ -extern void volk_32f_s32f_normalize_a16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); -static inline void volk_32f_s32f_normalize_a16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ +extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); +static inline void volk_32f_s32f_normalize_a_orc(float* vecBuffer, const float scalar, unsigned int num_points){ float invscalar = 1.0 / scalar; - volk_32f_s32f_normalize_a16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); + volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); } #endif /* LV_HAVE_GENERIC */ -#endif /* INCLUDED_volk_32f_s32f_normalize_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_normalize_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/include/volk/volk_32f_s32f_power_32f_a.h index ecff901e1..c4fa31bd1 100644 --- a/volk/include/volk/volk_32f_s32f_power_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_power_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_power_32f_a16_H -#define INCLUDED_volk_32f_s32f_power_32f_a16_H +#ifndef INCLUDED_volk_32f_s32f_power_32f_a_H +#define INCLUDED_volk_32f_s32f_power_32f_a_H #include #include @@ -19,7 +19,7 @@ \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ +static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -76,7 +76,7 @@ static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const floa \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ +static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -127,7 +127,7 @@ static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32f_s32f_power_32f_a16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ +static inline void volk_32f_s32f_power_32f_a_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; unsigned int number = 0; @@ -141,4 +141,4 @@ static inline void volk_32f_s32f_power_32f_a16_generic(float* cVector, const flo -#endif /* INCLUDED_volk_32f_s32f_power_32f_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h index c2b903657..881067bdc 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H -#define INCLUDED_volk_32f_s32f_stddev_32f_a16_H +#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H +#define INCLUDED_volk_32f_s32f_stddev_32f_a_H #include #include @@ -15,7 +15,7 @@ \param mean The mean of the input buffer \param num_points The number of values in input buffer to used in the stddev calculation */ -static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ +static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ float returnValue = 0; if(num_points > 0){ unsigned int number = 0; @@ -75,7 +75,7 @@ static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const floa \param mean The mean of the input buffer \param num_points The number of values in input buffer to used in the stddev calculation */ -static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ +static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ float returnValue = 0; if(num_points > 0){ unsigned int number = 0; @@ -120,7 +120,7 @@ static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* \param mean The mean of the input buffer \param num_points The number of values in input buffer to used in the stddev calculation */ -static inline void volk_32f_s32f_stddev_32f_a16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ +static inline void volk_32f_s32f_stddev_32f_a_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ float returnValue = 0; if(num_points > 0){ const float* aPtr = inputBuffer; @@ -142,4 +142,4 @@ static inline void volk_32f_s32f_stddev_32f_a16_generic(float* stddev, const flo -#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */ diff --git a/volk/include/volk/volk_32f_sqrt_32f_a.h b/volk/include/volk/volk_32f_sqrt_32f_a.h index a9ce76f88..e44c73cfd 100644 --- a/volk/include/volk/volk_32f_sqrt_32f_a.h +++ b/volk/include/volk/volk_32f_sqrt_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_sqrt_32f_a16_H -#define INCLUDED_volk_32f_sqrt_32f_a16_H +#ifndef INCLUDED_volk_32f_sqrt_32f_a_H +#define INCLUDED_volk_32f_sqrt_32f_a_H #include #include @@ -13,7 +13,7 @@ \param aVector One of the vectors to be sqrted \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector */ -static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVector, unsigned int num_points){ +static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -47,7 +47,7 @@ static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVecto \param aVector One of the vectors to be sqrted \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector */ -static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aVector, unsigned int num_points){ +static inline void volk_32f_sqrt_32f_a_generic(float* cVector, const float* aVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; unsigned int number = 0; @@ -59,19 +59,19 @@ static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aV #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void volk_32f_sqrt_32f_a16_orc_impl(float *, const float*, unsigned int); +extern void volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int); /*! \brief Sqrts the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored \param aVector One of the vectors to be sqrted \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector */ -static inline void volk_32f_sqrt_32f_a16_orc(float* cVector, const float* aVector, unsigned int num_points){ - volk_32f_sqrt_32f_a16_orc_impl(cVector, aVector, num_points); +static inline void volk_32f_sqrt_32f_a_orc(float* cVector, const float* aVector, unsigned int num_points){ + volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_sqrt_32f_a16_H */ +#endif /* INCLUDED_volk_32f_sqrt_32f_a_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h index 10d72e09d..3a82e3d2f 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H -#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H +#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H +#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H #include #include @@ -15,7 +15,7 @@ \param inputBuffer The buffer of points to calculate the std deviation for \param num_points The number of values in input buffer to used in the stddev and mean calculations */ -static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ float returnValue = 0; float newMean = 0; if(num_points > 0){ @@ -88,7 +88,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, flo \param inputBuffer The buffer of points to calculate the std deviation for \param num_points The number of values in input buffer to used in the stddev and mean calculations */ -static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ float returnValue = 0; float newMean = 0; if(num_points > 0){ @@ -143,7 +143,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* \param inputBuffer The buffer of points to calculate the std deviation for \param num_points The number of values in input buffer to used in the stddev and mean calculations */ -static inline void volk_32f_stddev_and_mean_32f_x2_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_stddev_and_mean_32f_x2_a_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ float returnValue = 0; float newMean = 0; if(num_points > 0){ @@ -167,4 +167,4 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_generic(float* stddev, fl -#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H */ +#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */ diff --git a/volk/include/volk/volk_32f_x2_add_32f_a.h b/volk/include/volk/volk_32f_x2_add_32f_a.h index 2de6a6644..3bc83653b 100644 --- a/volk/include/volk/volk_32f_x2_add_32f_a.h +++ b/volk/include/volk/volk_32f_x2_add_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_add_32f_a16_H -#define INCLUDED_volk_32f_x2_add_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_add_32f_a_H +#define INCLUDED_volk_32f_x2_add_32f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector One of the vectors to be added \param num_points The number of values in aVector and bVector to be added together and stored into cVector */ -static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -51,7 +51,7 @@ static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVec \param bVector One of the vectors to be added \param num_points The number of values in aVector and bVector to be added together and stored into cVector */ -static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -71,11 +71,11 @@ static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* \param bVector One of the vectors to be added \param num_points The number of values in aVector and bVector to be added together and stored into cVector */ -extern void volk_32f_x2_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_add_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_x2_add_32f_a16_H */ +#endif /* INCLUDED_volk_32f_x2_add_32f_a_H */ diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a.h b/volk/include/volk/volk_32f_x2_divide_32f_a.h index 1603e78de..52ddfae87 100644 --- a/volk/include/volk/volk_32f_x2_divide_32f_a.h +++ b/volk/include/volk/volk_32f_x2_divide_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_divide_32f_a16_H -#define INCLUDED_volk_32f_x2_divide_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_divide_32f_a_H +#define INCLUDED_volk_32f_x2_divide_32f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The divisor vector \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector */ -static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -51,7 +51,7 @@ static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* a \param bVector The divisor vector \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector */ -static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_divide_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -71,12 +71,12 @@ static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const floa \param bVector The divisor vector \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector */ -extern void volk_32f_x2_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_divide_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_x2_divide_32f_a16_H */ +#endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h index 2cd974070..0c58f2ecf 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H -#define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H +#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H #include #include @@ -8,7 +8,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { float dotProduct = 0; const float* aPtr = input; @@ -28,7 +28,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const fl #ifdef LV_HAVE_SSE -static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -78,7 +78,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float #include -static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -125,7 +125,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float #include -static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; @@ -182,4 +182,4 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const flo #endif /*LV_HAVE_SSE4_1*/ -#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a16_H*/ +#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h index f3731fa2a..1d4d2dbbd 100644 --- a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h +++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a16_H -#define INCLUDED_volk_32f_x2_interleave_32fc_a16_H +#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a_H +#define INCLUDED_volk_32f_x2_interleave_32fc_a_H #include #include @@ -13,7 +13,7 @@ \param complexVector The complex output vector \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ +static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ unsigned int number = 0; float* complexVectorPtr = (float*)complexVector; const float* iBufferPtr = iBuffer; @@ -56,7 +56,7 @@ static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, \param complexVector The complex output vector \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ +static inline void volk_32f_x2_interleave_32fc_a_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ float* complexVectorPtr = (float*)complexVector; const float* iBufferPtr = iBuffer; const float* qBufferPtr = qBuffer; @@ -72,4 +72,4 @@ static inline void volk_32f_x2_interleave_32fc_a16_generic(lv_32fc_t* complexVec -#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a16_H */ +#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */ diff --git a/volk/include/volk/volk_32f_x2_max_32f_a.h b/volk/include/volk/volk_32f_x2_max_32f_a.h index 60be6e36d..7948c458d 100644 --- a/volk/include/volk/volk_32f_x2_max_32f_a.h +++ b/volk/include/volk/volk_32f_x2_max_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_max_32f_a16_H -#define INCLUDED_volk_32f_x2_max_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_max_32f_a_H +#define INCLUDED_volk_32f_x2_max_32f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -53,7 +53,7 @@ static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVec \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_max_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -75,11 +75,11 @@ static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -extern void volk_32f_x2_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_max_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_x2_max_32f_a16_H */ +#endif /* INCLUDED_volk_32f_x2_max_32f_a_H */ diff --git a/volk/include/volk/volk_32f_x2_min_32f_a.h b/volk/include/volk/volk_32f_x2_min_32f_a.h index 3b8291531..d77134868 100644 --- a/volk/include/volk/volk_32f_x2_min_32f_a.h +++ b/volk/include/volk/volk_32f_x2_min_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_min_32f_a16_H -#define INCLUDED_volk_32f_x2_min_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_min_32f_a_H +#define INCLUDED_volk_32f_x2_min_32f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -53,7 +53,7 @@ static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVec \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_min_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -75,11 +75,11 @@ static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -extern void volk_32f_x2_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_min_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_x2_min_32f_a16_H */ +#endif /* INCLUDED_volk_32f_x2_min_32f_a_H */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a.h b/volk/include/volk/volk_32f_x2_multiply_32f_a.h index 885941abf..fae9a652f 100644 --- a/volk/include/volk/volk_32f_x2_multiply_32f_a.h +++ b/volk/include/volk/volk_32f_x2_multiply_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_multiply_32f_a16_H -#define INCLUDED_volk_32f_x2_multiply_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H +#define INCLUDED_volk_32f_x2_multiply_32f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector One of the vectors to be multiplied \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -52,7 +52,7 @@ static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* \param bVector One of the vectors to be multiplied \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32f_x2_multiply_32f_a16_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; @@ -90,7 +90,7 @@ static inline void volk_32f_x2_multiply_32f_a16_avx(float* cVector, const float* \param bVector One of the vectors to be multiplied \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -110,11 +110,11 @@ static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const fl \param bVector One of the vectors to be multiplied \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -extern void volk_32f_x2_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_multiply_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_x2_multiply_32f_a16_H */ +#endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */ diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h index f7ad3fd18..cc02c3678 100644 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H -#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H +#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H +#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H #include #include @@ -15,7 +15,7 @@ \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ unsigned int number = 0; const float* iBufferPtr = iBuffer; const float* qBufferPtr = qBuffer; @@ -73,7 +73,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexV \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ unsigned int number = 0; const float* iBufferPtr = iBuffer; const float* qBufferPtr = qBuffer; @@ -137,7 +137,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVe \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_s32f_interleave_16ic_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_x2_s32f_interleave_16ic_a_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ int16_t* complexVectorPtr = (int16_t*)complexVector; const float* iBufferPtr = iBuffer; const float* qBufferPtr = qBuffer; @@ -153,4 +153,4 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a16_generic(lv_16sc_t* compl -#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H */ +#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */ diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a.h b/volk/include/volk/volk_32f_x2_subtract_32f_a.h index c01f2c1f3..16cad008a 100644 --- a/volk/include/volk/volk_32f_x2_subtract_32f_a.h +++ b/volk/include/volk/volk_32f_x2_subtract_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_subtract_32f_a16_H -#define INCLUDED_volk_32f_x2_subtract_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H +#define INCLUDED_volk_32f_x2_subtract_32f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The vector to be subtracted \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector */ -static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -51,7 +51,7 @@ static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* \param bVector The vector to be subtracted \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector */ -static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_subtract_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -71,11 +71,11 @@ static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const fl \param bVector The vector to be subtracted \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector */ -extern void volk_32f_x2_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_subtract_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_x2_subtract_32f_a16_H */ +#endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */ diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h index 6e446cbef..2ea8fa96d 100644 --- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h +++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H -#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H +#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H +#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H #include #include @@ -13,7 +13,7 @@ #include #include -static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { +static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { float result = 0.0; @@ -100,7 +100,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* sr #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { +static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { @@ -148,4 +148,4 @@ static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H*/ +#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/ diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h index 846315a4a..b7350b9fa 100644 --- a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a16_H -#define INCLUDED_volk_32fc_32f_multiply_32fc_a16_H +#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H +#define INCLUDED_volk_32fc_32f_multiply_32fc_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The vectors containing the float values to be multiplied against each complex value in aVector \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -64,7 +64,7 @@ static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; const float* bPtr= bVector; @@ -84,12 +84,12 @@ static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, c \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -extern void volk_32fc_32f_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32fc_32f_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - volk_32fc_32f_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32fc_32f_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_GENERIC */ -#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a16_H */ +#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h index 3e7c3fa28..9de036ef4 100644 --- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H -#define INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H +#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a_H +#define INCLUDED_volk_32fc_deinterleave_32f_x2_a_H #include #include @@ -13,7 +13,7 @@ \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -57,7 +57,7 @@ static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -72,4 +72,4 @@ static inline void volk_32fc_deinterleave_32f_x2_a16_generic(float* iBuffer, flo -#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H */ +#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h index 945a26742..29c369d9a 100644 --- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H -#define INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H +#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H +#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H #include #include @@ -13,7 +13,7 @@ \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; @@ -59,7 +59,7 @@ static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, doubl \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_64f_x2_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; double* iBufferPtr = iBuffer; @@ -75,4 +75,4 @@ static inline void volk_32fc_deinterleave_64f_x2_a16_generic(double* iBuffer, do -#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H */ +#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h index 3c3fb2583..a1d0fd5d1 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a16_H -#define INCLUDED_volk_32fc_deinterleave_real_32f_a16_H +#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a_H +#define INCLUDED_volk_32fc_deinterleave_real_32f_a_H #include #include @@ -12,7 +12,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -51,7 +51,7 @@ static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_real_32f_a_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; float* iBufferPtr = iBuffer; @@ -65,4 +65,4 @@ static inline void volk_32fc_deinterleave_real_32f_a16_generic(float* iBuffer, c -#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a16_H */ +#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h index 40c1a7a46..70a3b1971 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a16_H -#define INCLUDED_volk_32fc_deinterleave_real_64f_a16_H +#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a_H +#define INCLUDED_volk_32fc_deinterleave_real_64f_a_H #include #include @@ -12,7 +12,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; @@ -49,7 +49,7 @@ static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, con \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_real_64f_a16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_real_64f_a_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; double* iBufferPtr = iBuffer; @@ -63,4 +63,4 @@ static inline void volk_32fc_deinterleave_real_64f_a16_generic(double* iBuffer, -#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a16_H */ +#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a_H */ diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/include/volk/volk_32fc_index_max_16u_a.h index 0ad1edbe9..312e034e2 100644 --- a/volk/include/volk/volk_32fc_index_max_16u_a.h +++ b/volk/include/volk/volk_32fc_index_max_16u_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_index_max_16u_a16_H -#define INCLUDED_volk_32fc_index_max_16u_a16_H +#ifndef INCLUDED_volk_32fc_index_max_16u_a_H +#define INCLUDED_volk_32fc_index_max_16u_a_H #include #include @@ -11,7 +11,7 @@ #include -static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { +static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { @@ -189,7 +189,7 @@ static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32f #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { +static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { float sq_dist = 0.0; float max = 0.0; unsigned int index = 0; @@ -212,4 +212,4 @@ static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_ #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_32fc_index_max_16u_a16_H*/ +#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a.h b/volk/include/volk/volk_32fc_magnitude_32f_a.h index 946190e41..f18e9bc0b 100644 --- a/volk/include/volk/volk_32fc_magnitude_32f_a.h +++ b/volk/include/volk/volk_32fc_magnitude_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_magnitude_32f_a16_H -#define INCLUDED_volk_32fc_magnitude_32f_a16_H +#ifndef INCLUDED_volk_32fc_magnitude_32f_a_H +#define INCLUDED_volk_32fc_magnitude_32f_a_H #include #include @@ -13,7 +13,7 @@ \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -56,7 +56,7 @@ static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, cons \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -103,7 +103,7 @@ static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; float* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -122,11 +122,11 @@ static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, c \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -extern void volk_32fc_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); -static inline void volk_32fc_magnitude_32f_a16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - volk_32fc_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, num_points); +extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); +static inline void volk_32fc_magnitude_32f_a_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32fc_magnitude_32f_a16_H */ +#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h index 55b1b6c70..9304b0c28 100644 --- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a16_H -#define INCLUDED_volk_32fc_s32f_atan2_32f_a16_H +#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H +#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H #include #include @@ -19,7 +19,7 @@ \param normalizeFactor The atan2 results will be divided by this normalization factor. \param num_points The number of complex values in the input vector. */ -static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; float* outPtr = outputVector; @@ -81,7 +81,7 @@ static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, con \param normalizeFactor The atan2 results will be divided by this normalization factor. \param num_points The number of complex values in the input vector. */ -static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; float* outPtr = outputVector; @@ -139,7 +139,7 @@ static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const \param normalizeFactor The atan2 results will be divided by this normalization factor. \param num_points The number of complex values in the input vector. */ -static inline void volk_32fc_s32f_atan2_32f_a16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_atan2_32f_a_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ float* outPtr = outputVector; const float* inPtr = (float*)inputVector; const float invNormalizeFactor = 1.0 / normalizeFactor; @@ -155,4 +155,4 @@ static inline void volk_32fc_s32f_atan2_32f_a16_generic(float* outputVector, con -#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a16_H */ +#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h index 2460039d2..1c17fb70c 100644 --- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h +++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H -#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H +#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H +#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H #include #include @@ -14,7 +14,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -63,7 +63,7 @@ static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_s32f_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; int16_t* iBufferPtr = iBuffer; unsigned int number = 0; @@ -78,4 +78,4 @@ static inline void volk_32fc_s32f_deinterleave_real_16i_a16_generic(int16_t* iBu -#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H */ +#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h index f67ab0607..38fd609d3 100644 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H -#define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H +#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H +#define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H #include #include @@ -15,7 +15,7 @@ \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -70,7 +70,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVecto \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -129,7 +129,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; int16_t* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -149,11 +149,11 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVe \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -extern void volk_32fc_s32f_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_32fc_s32f_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - volk_32fc_s32f_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +extern void volk_32fc_s32f_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_32fc_s32f_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + volk_32fc_s32f_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H */ +#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h index 155b93ca2..ec1d7167f 100644 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h @@ -1,12 +1,12 @@ -#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a16_H -#define INCLUDED_volk_32fc_s32f_power_32fc_a16_H +#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a_H +#define INCLUDED_volk_32fc_s32f_power_32fc_a_H #include #include #include //! raise a complex float to a real float power -static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a16(const lv_32fc_t exp, const float power){ +static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, const float power){ const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp)); const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2); return mag*lv_cmake(cosf(arg), sinf(arg)); @@ -26,7 +26,7 @@ static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a16(const lv_32fc_t exp, c \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ +static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -81,7 +81,7 @@ static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const l #endif /* LV_HAVE_LIB_SIMDMATH */ for(;number < num_points; number++){ - *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); } } #endif /* LV_HAVE_SSE */ @@ -94,13 +94,13 @@ static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const l \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ +static inline void volk_32fc_s32f_power_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; unsigned int number = 0; for(number = 0; number < num_points; number++){ - *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); } } #endif /* LV_HAVE_GENERIC */ @@ -108,4 +108,4 @@ static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, con -#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a16_H */ +#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h index 03da069c2..8d1959dae 100644 --- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H -#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H +#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H +#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H #include #include @@ -19,7 +19,7 @@ \param normalizationFactor This value is divided against all the input values before the power is calculated \param num_points The number of fft data points */ -static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ const float* inputPtr = (const float*)complexFFTInput; float* destPtr = logPowerOutput; uint64_t number = 0; @@ -96,7 +96,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOut \param normalizationFactor This value is divided agains all the input values before the power is calculated \param num_points The number of fft data points */ -static inline void volk_32fc_s32f_power_spectrum_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_power_spectrum_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ // Calculate the Power of the complex point const float* inputPtr = (float*)complexFFTInput; float* realFFTDataPointsPtr = logPowerOutput; @@ -123,4 +123,4 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a16_generic(float* logPower -#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H */ +#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h index 5bcd7f7c4..fc635f171 100644 --- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H -#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H +#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H +#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H #include #include @@ -20,7 +20,7 @@ \param rbw The resolution bandwith of the fft spectrum \param num_points The number of fft data points */ -static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ const float* inputPtr = (const float*)complexFFTInput; float* destPtr = logPowerOutput; uint64_t number = 0; @@ -103,7 +103,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* \param rbw The resolution bandwith of the fft spectrum \param num_points The number of fft data points */ -static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ // Calculate the Power of the complex point const float* inputPtr = (float*)complexFFTInput; float* realFFTDataPointsPtr = logPowerOutput; @@ -131,4 +131,4 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_generic(floa -#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H */ +#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h index f221237ff..a6c21336d 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H -#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H +#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H +#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H #include #include @@ -9,7 +9,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { float * res = (float*) result; float * in = (float*) input; @@ -63,7 +63,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* r #if LV_HAVE_SSE && LV_HAVE_64 -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; @@ -204,7 +204,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* resul #endif #if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; @@ -342,4 +342,4 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* re -#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H*/ +#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/ diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h index 9657c8f6b..022a0a614 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H -#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H +#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H +#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H #include #include @@ -10,7 +10,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { float * res = (float*) result; float * in = (float*) input; @@ -60,7 +60,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, con #if LV_HAVE_SSE && LV_HAVE_64 -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { asm @@ -195,7 +195,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, cons #if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { asm volatile ( @@ -321,7 +321,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, cons #include -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { lv_32fc_t dotProduct; @@ -378,8 +378,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const #include -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - volk_32fc_x2_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes); +static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + volk_32fc_x2_dot_prod_32fc_a_sse3(result, input, taps, num_bytes); // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now /* __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; @@ -466,4 +466,4 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, cons #endif /*LV_HAVE_SSE4_1*/ -#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H*/ +#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h index 72010b855..18dd092e8 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a16_H -#define INCLUDED_volk_32fc_x2_multiply_32fc_a16_H +#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H +#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H #include #include @@ -15,7 +15,7 @@ \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ +static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int halfPoints = num_points / 2; @@ -61,7 +61,7 @@ static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ +static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; const lv_32fc_t* bPtr= bVector; @@ -81,9 +81,9 @@ static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, co \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); -static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); +static inline void volk_32fc_x2_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ @@ -91,4 +91,4 @@ static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const -#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a16_H */ +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h index 910f51679..be7a4ffe9 100644 --- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h +++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H -#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H +#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H +#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H #include #include @@ -10,7 +10,7 @@ #include #include -static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; @@ -106,7 +106,7 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; int i = 0; @@ -123,4 +123,4 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(flo #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H*/ +#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/ diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h index 551f3cb53..c21d00491 100644 --- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h +++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a16_H -#define INCLUDED_volk_32fc_x2_square_dist_32f_a16_H +#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H +#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H #include #include @@ -9,7 +9,7 @@ #include #include -static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { +static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; @@ -92,7 +92,7 @@ static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_ #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { +static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; int i = 0; @@ -109,4 +109,4 @@ static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32 #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a16_H*/ +#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a.h b/volk/include/volk/volk_32i_s32f_convert_32f_a.h index b744c7197..558142869 100644 --- a/volk/include/volk/volk_32i_s32f_convert_32f_a.h +++ b/volk/include/volk/volk_32i_s32f_convert_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32i_s32f_convert_32f_a16_H -#define INCLUDED_volk_32i_s32f_convert_32f_a16_H +#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H +#define INCLUDED_volk_32i_s32f_convert_32f_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -55,7 +55,7 @@ static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_32i_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ float* outputVectorPtr = outputVector; const int32_t* inputVectorPtr = inputVector; unsigned int number = 0; @@ -70,4 +70,4 @@ static inline void volk_32i_s32f_convert_32f_a16_generic(float* outputVector, co -#endif /* INCLUDED_volk_32i_s32f_convert_32f_a16_H */ +#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_32i_x2_and_32i_a.h b/volk/include/volk/volk_32i_x2_and_32i_a.h index 4d50efd32..dcd63d98e 100644 --- a/volk/include/volk/volk_32i_x2_and_32i_a.h +++ b/volk/include/volk/volk_32i_x2_and_32i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32i_x2_and_32i_a16_H -#define INCLUDED_volk_32i_x2_and_32i_a16_H +#ifndef INCLUDED_volk_32i_x2_and_32i_a_H +#define INCLUDED_volk_32i_x2_and_32i_a_H #include #include @@ -13,7 +13,7 @@ \param bVector One of the vectors \param num_points The number of values in aVector and bVector to be anded together and stored into cVector */ -static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -51,7 +51,7 @@ static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* \param bVector One of the vectors \param num_points The number of values in aVector and bVector to be anded together and stored into cVector */ -static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_and_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ int32_t* cPtr = cVector; const int32_t* aPtr = aVector; const int32_t* bPtr= bVector; @@ -71,11 +71,11 @@ static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32 \param bVector One of the vectors \param num_points The number of values in aVector and bVector to be anded together and stored into cVector */ -extern void volk_32i_x2_and_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32i_x2_and_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32i_x2_and_32i_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32i_x2_and_32i_a_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32i_x2_and_32i_a16_H */ +#endif /* INCLUDED_volk_32i_x2_and_32i_a_H */ diff --git a/volk/include/volk/volk_32i_x2_or_32i_a.h b/volk/include/volk/volk_32i_x2_or_32i_a.h index 9edbdbafd..243e8178c 100644 --- a/volk/include/volk/volk_32i_x2_or_32i_a.h +++ b/volk/include/volk/volk_32i_x2_or_32i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32i_x2_or_32i_a16_H -#define INCLUDED_volk_32i_x2_or_32i_a16_H +#ifndef INCLUDED_volk_32i_x2_or_32i_a_H +#define INCLUDED_volk_32i_x2_or_32i_a_H #include #include @@ -13,7 +13,7 @@ \param bVector One of the vectors to be ored \param num_points The number of values in aVector and bVector to be ored together and stored into cVector */ -static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -51,7 +51,7 @@ static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* a \param bVector One of the vectors to be ored \param num_points The number of values in aVector and bVector to be ored together and stored into cVector */ -static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_or_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ int32_t* cPtr = cVector; const int32_t* aPtr = aVector; const int32_t* bPtr= bVector; @@ -71,11 +71,11 @@ static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_ \param bVector One of the vectors to be ored \param num_points The number of values in aVector and bVector to be ored together and stored into cVector */ -extern void volk_32i_x2_or_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32i_x2_or_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32i_x2_or_32i_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32i_x2_or_32i_a_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32i_x2_or_32i_a16_H */ +#endif /* INCLUDED_volk_32i_x2_or_32i_a_H */ diff --git a/volk/include/volk/volk_32u_byteswap_a.h b/volk/include/volk/volk_32u_byteswap_a.h index dc5cedab9..b88848096 100644 --- a/volk/include/volk/volk_32u_byteswap_a.h +++ b/volk/include/volk/volk_32u_byteswap_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32u_byteswap_a16_H -#define INCLUDED_volk_32u_byteswap_a16_H +#ifndef INCLUDED_volk_32u_byteswap_a_H +#define INCLUDED_volk_32u_byteswap_a_H #include #include @@ -12,7 +12,7 @@ \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int num_points){ +static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points){ unsigned int number = 0; uint32_t* inputPtr = intsToSwap; @@ -57,7 +57,7 @@ static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -static inline void volk_32u_byteswap_a16_generic(uint32_t* intsToSwap, unsigned int num_points){ +static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points){ uint32_t* inputPtr = intsToSwap; unsigned int point; @@ -74,4 +74,4 @@ static inline void volk_32u_byteswap_a16_generic(uint32_t* intsToSwap, unsigned -#endif /* INCLUDED_volk_32u_byteswap_a16_H */ +#endif /* INCLUDED_volk_32u_byteswap_a_H */ diff --git a/volk/include/volk/volk_32u_popcnt_a.h b/volk/include/volk/volk_32u_popcnt_a.h index 0d8b48fd5..b72d605c6 100644 --- a/volk/include/volk/volk_32u_popcnt_a.h +++ b/volk/include/volk/volk_32u_popcnt_a.h @@ -7,7 +7,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t value) { +static inline void volk_32u_popcnt_a_generic(uint32_t* ret, const uint32_t value) { // This is faster than a lookup table uint32_t retVal = value; @@ -27,7 +27,7 @@ static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t val #include -static inline void volk_32u_popcnt_a16_sse4_2(uint32_t* ret, const uint32_t value) { +static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value) { *ret = _mm_popcnt_u32(value); } diff --git a/volk/include/volk/volk_64f_convert_32f_a.h b/volk/include/volk/volk_64f_convert_32f_a.h index cfcdbdc3a..2126e4f95 100644 --- a/volk/include/volk/volk_64f_convert_32f_a.h +++ b/volk/include/volk/volk_64f_convert_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_64f_convert_32f_a16_H -#define INCLUDED_volk_64f_convert_32f_a16_H +#ifndef INCLUDED_volk_64f_convert_32f_a_H +#define INCLUDED_volk_64f_convert_32f_a_H #include #include @@ -12,7 +12,7 @@ \param fVector The double vector values to be converted \param num_points The number of points in the two vectors to be converted */ -static inline void volk_64f_convert_32f_a16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ +static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -50,7 +50,7 @@ static inline void volk_64f_convert_32f_a16_sse2(float* outputVector, const doub \param fVector The double vector values to be converted \param num_points The number of points in the two vectors to be converted */ -static inline void volk_64f_convert_32f_a16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ +static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){ float* outputVectorPtr = outputVector; const double* inputVectorPtr = inputVector; unsigned int number = 0; @@ -64,4 +64,4 @@ static inline void volk_64f_convert_32f_a16_generic(float* outputVector, const d -#endif /* INCLUDED_volk_64f_convert_32f_a16_H */ +#endif /* INCLUDED_volk_64f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_64f_x2_max_64f_a.h b/volk/include/volk/volk_64f_x2_max_64f_a.h index 21f488bf7..61a704c52 100644 --- a/volk/include/volk/volk_64f_x2_max_64f_a.h +++ b/volk/include/volk/volk_64f_x2_max_64f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_64f_x2_max_64f_a16_H -#define INCLUDED_volk_64f_x2_max_64f_a16_H +#ifndef INCLUDED_volk_64f_x2_max_64f_a_H +#define INCLUDED_volk_64f_x2_max_64f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ +static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int halfPoints = num_points / 2; @@ -53,7 +53,7 @@ static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* a \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_64f_x2_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ +static inline void volk_64f_x2_max_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ double* cPtr = cVector; const double* aPtr = aVector; const double* bPtr= bVector; @@ -68,4 +68,4 @@ static inline void volk_64f_x2_max_64f_a16_generic(double* cVector, const double #endif /* LV_HAVE_GENERIC */ -#endif /* INCLUDED_volk_64f_x2_max_64f_a16_H */ +#endif /* INCLUDED_volk_64f_x2_max_64f_a_H */ diff --git a/volk/include/volk/volk_64f_x2_min_64f_a.h b/volk/include/volk/volk_64f_x2_min_64f_a.h index 8711a0eae..148b72c59 100644 --- a/volk/include/volk/volk_64f_x2_min_64f_a.h +++ b/volk/include/volk/volk_64f_x2_min_64f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_64f_x2_min_64f_a16_H -#define INCLUDED_volk_64f_x2_min_64f_a16_H +#ifndef INCLUDED_volk_64f_x2_min_64f_a_H +#define INCLUDED_volk_64f_x2_min_64f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ +static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int halfPoints = num_points / 2; @@ -53,7 +53,7 @@ static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* a \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_64f_x2_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ +static inline void volk_64f_x2_min_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ double* cPtr = cVector; const double* aPtr = aVector; const double* bPtr= bVector; @@ -68,4 +68,4 @@ static inline void volk_64f_x2_min_64f_a16_generic(double* cVector, const double #endif /* LV_HAVE_GENERIC */ -#endif /* INCLUDED_volk_64f_x2_min_64f_a16_H */ +#endif /* INCLUDED_volk_64f_x2_min_64f_a_H */ diff --git a/volk/include/volk/volk_64u_byteswap_a.h b/volk/include/volk/volk_64u_byteswap_a.h index b4bed8451..d4fc74a6e 100644 --- a/volk/include/volk/volk_64u_byteswap_a.h +++ b/volk/include/volk/volk_64u_byteswap_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_64u_byteswap_a16_H -#define INCLUDED_volk_64u_byteswap_a16_H +#ifndef INCLUDED_volk_64u_byteswap_a_H +#define INCLUDED_volk_64u_byteswap_a_H #include #include @@ -12,7 +12,7 @@ \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){ uint32_t* inputPtr = (uint32_t*)intsToSwap; __m128i input, byte1, byte2, byte3, byte4, output; __m128i byte2mask = _mm_set1_epi32(0x00FF0000); @@ -65,7 +65,7 @@ static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -static inline void volk_64u_byteswap_a16_generic(uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){ uint32_t* inputPtr = (uint32_t*)intsToSwap; unsigned int point; for(point = 0; point < num_points; point++){ @@ -85,4 +85,4 @@ static inline void volk_64u_byteswap_a16_generic(uint64_t* intsToSwap, unsigned -#endif /* INCLUDED_volk_64u_byteswap_a16_H */ +#endif /* INCLUDED_volk_64u_byteswap_a_H */ diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h index 8b92e91a1..bdaa98643 100644 --- a/volk/include/volk/volk_64u_popcnt_a.h +++ b/volk/include/volk/volk_64u_popcnt_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_64u_popcnt_a16_H -#define INCLUDED_volk_64u_popcnt_a16_H +#ifndef INCLUDED_volk_64u_popcnt_a_H +#define INCLUDED_volk_64u_popcnt_a_H #include #include @@ -8,7 +8,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t value) { +static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) { const uint32_t* valueVector = (const uint32_t*)&value; @@ -40,11 +40,11 @@ static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t val #include -static inline void volk_64u_popcnt_a16_sse4_2(uint64_t* ret, const uint64_t value) { +static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) { *ret = _mm_popcnt_u64(value); } #endif /*LV_HAVE_SSE4_2*/ -#endif /*INCLUDED_volk_64u_popcnt_a16_H*/ +#endif /*INCLUDED_volk_64u_popcnt_a_H*/ diff --git a/volk/include/volk/volk_8i_convert_16i_a.h b/volk/include/volk/volk_8i_convert_16i_a.h index 260ac40a1..9104f90cb 100644 --- a/volk/include/volk/volk_8i_convert_16i_a.h +++ b/volk/include/volk/volk_8i_convert_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8i_convert_16i_a16_H -#define INCLUDED_volk_8i_convert_16i_a16_H +#ifndef INCLUDED_volk_8i_convert_16i_a_H +#define INCLUDED_volk_8i_convert_16i_a_H #include #include @@ -13,7 +13,7 @@ \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ +static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; @@ -54,7 +54,7 @@ static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const i \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ +static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ int16_t* outputVectorPtr = outputVector; const int8_t* inputVectorPtr = inputVector; unsigned int number = 0; @@ -72,9 +72,9 @@ static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -extern void volk_8i_convert_16i_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); -static inline void volk_8i_convert_16i_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - volk_8i_convert_16i_a16_orc_impl(outputVector, inputVector, num_points); +extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); +static inline void volk_8i_convert_16i_a_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a.h b/volk/include/volk/volk_8i_s32f_convert_32f_a.h index 9991b150e..7f2623ac6 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_a.h +++ b/volk/include/volk/volk_8i_s32f_convert_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8i_s32f_convert_32f_a16_H -#define INCLUDED_volk_8i_s32f_convert_32f_a16_H +#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H +#define INCLUDED_volk_8i_s32f_convert_32f_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; @@ -74,7 +74,7 @@ static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, cons \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ float* outputVectorPtr = outputVector; const int8_t* inputVectorPtr = inputVector; unsigned int number = 0; @@ -94,10 +94,10 @@ static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, con \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); -static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ +extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); +static inline void volk_8i_s32f_convert_32f_a_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ float invscalar = 1.0 / scalar; - volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, invscalar, num_points); + volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h index 249acab49..8f13da32f 100644 --- a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h +++ b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H -#define INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H +#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H +#define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H #include #include @@ -13,7 +13,7 @@ \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -59,7 +59,7 @@ static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ const int8_t* complexVectorPtr = (const int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; int16_t* qBufferPtr = qBuffer; @@ -74,4 +74,4 @@ static inline void volk_8ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, in -#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H */ +#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h index 7b64b37c5..d26b3d0d0 100644 --- a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h +++ b/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a16_H -#define INCLUDED_volk_8ic_deinterleave_real_16i_a16_H +#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a_H +#define INCLUDED_volk_8ic_deinterleave_real_16i_a_H #include #include @@ -12,7 +12,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -49,7 +49,7 @@ static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, c \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (const int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -63,4 +63,4 @@ static inline void volk_8ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, -#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a16_H */ +#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h index a1abad487..21efed83e 100644 --- a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h +++ b/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h @@ -12,7 +12,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int8_t* iBufferPtr = iBuffer; @@ -50,7 +50,7 @@ static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, cons \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int8_t* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h index 7d778796e..b723c6f8b 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H -#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H +#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H +#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H #include #include @@ -15,7 +15,7 @@ \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -85,7 +85,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -146,7 +146,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, flo \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ const int8_t* complexVectorPtr = (const int8_t*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -162,4 +162,4 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, -#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H */ +#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h index a2e0cd8de..74073f5a6 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H -#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H +#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H +#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; unsigned int number = 0; @@ -71,7 +71,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; unsigned int number = 0; @@ -116,7 +116,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, c \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (const int8_t*)complexVector; float* iBufferPtr = iBuffer; @@ -131,4 +131,4 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffe -#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H */ +#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */ diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h index 7307ae484..0bb76f1d1 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H -#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H +#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H +#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H #include #include @@ -14,7 +14,7 @@ \param bVector The complex vector which will be converted to complex conjugate and multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -75,7 +75,7 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVe \param bVector The complex vector which will be converted to complex conjugate and multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +static inline void volk_8ic_x2_multiply_conjugate_16ic_a_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ unsigned int number = 0; int16_t* c16Ptr = (int16_t*)cVector; int8_t* a8Ptr = (int8_t*)aVector; @@ -98,4 +98,4 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cV -#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H */ +#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H */ diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h index adc7c0599..3e05608a4 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H -#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H +#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H +#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H #include #include @@ -14,7 +14,7 @@ \param bVector The complex vector which will be converted to complex conjugate and multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -95,7 +95,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t \param bVector The complex vector which will be converted to complex conjugate and multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ unsigned int number = 0; float* cPtr = (float*)cVector; const float invScalar = 1.0 / scalar; @@ -119,4 +119,4 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_ -#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H */ +#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */ -- cgit From c80e7b00de836a388013a002008c7256481414cb Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 18 May 2011 13:39:57 -0700 Subject: Volk: renamed everything else to _a instead of _a16, makefiles included --- volk/include/volk/Makefile.am | 152 +++++++++++++++++++++--------------------- 1 file changed, 76 insertions(+), 76 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 5f9e134bc..b7da9b37c 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -33,91 +33,91 @@ volkinclude_HEADERS = \ $(top_gendir)/include/volk/volk_typedefs.h \ $(top_gendir)/include/volk/volk.h \ $(top_gendir)/include/volk/volk_cpu.h \ - volk_16i_x5_add_quad_16i_x4_a16.h \ - volk_16i_branch_4_state_8_a16.h \ - volk_16ic_deinterleave_16i_x2_a16.h \ - volk_16ic_s32f_deinterleave_32f_x2_a16.h \ - volk_16ic_deinterleave_real_16i_a16.h \ - volk_16ic_s32f_deinterleave_real_32f_a16.h \ - volk_16ic_deinterleave_real_8i_a16.h \ - volk_16ic_magnitude_16i_a16.h \ - volk_16ic_s32f_magnitude_32f_a16.h \ - volk_16i_s32f_convert_32f_a16.h \ + volk_16i_x5_add_quad_16i_x4_a.h \ + volk_16i_branch_4_state_8_a.h \ + volk_16ic_deinterleave_16i_x2_a.h \ + volk_16ic_s32f_deinterleave_32f_x2_a.h \ + volk_16ic_deinterleave_real_16i_a.h \ + volk_16ic_s32f_deinterleave_real_32f_a.h \ + volk_16ic_deinterleave_real_8i_a.h \ + volk_16ic_magnitude_16i_a.h \ + volk_16ic_s32f_magnitude_32f_a.h \ + volk_16i_s32f_convert_32f_a.h \ volk_16i_s32f_convert_32f_u.h \ - volk_16i_convert_8i_a16.h \ + volk_16i_convert_8i_a.h \ volk_16i_convert_8i_u.h \ - volk_16i_max_star_16i_a16.h \ - volk_16i_max_star_horizontal_16i_a16.h \ - volk_16i_permute_and_scalar_add_a16.h \ - volk_16i_x4_quad_max_star_16i_a16.h \ - volk_16u_byteswap_a16.h \ - volk_32f_accumulator_s32f_a16.h \ - volk_32f_x2_add_32f_a16.h \ - volk_32fc_32f_multiply_32fc_a16.h \ - volk_32fc_s32f_power_32fc_a16.h \ - volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h \ - volk_32fc_s32f_atan2_32f_a16.h \ - volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \ + volk_16i_max_star_16i_a.h \ + volk_16i_max_star_horizontal_16i_a.h \ + volk_16i_permute_and_scalar_add_a.h \ + volk_16i_x4_quad_max_star_16i_a.h \ + volk_16u_byteswap_a.h \ + volk_32f_accumulator_s32f_a.h \ + volk_32f_x2_add_32f_a.h \ + volk_32fc_32f_multiply_32fc_a.h \ + volk_32fc_s32f_power_32fc_a.h \ + volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \ + volk_32fc_s32f_atan2_32f_a.h \ + volk_32fc_x2_conjugate_dot_prod_32fc_a.h \ volk_32fc_x2_conjugate_dot_prod_32fc_u.h \ - volk_32fc_deinterleave_32f_x2_a16.h \ - volk_32fc_deinterleave_64f_x2_a16.h \ - volk_32fc_s32f_deinterleave_real_16i_a16.h \ - volk_32fc_deinterleave_real_32f_a16.h \ - volk_32fc_deinterleave_real_64f_a16.h \ - volk_32fc_x2_dot_prod_32fc_a16.h \ - volk_32fc_index_max_16u_a16.h \ - volk_32fc_s32f_magnitude_16i_a16.h \ - volk_32fc_magnitude_32f_a16.h \ - volk_32fc_x2_multiply_32fc_a16.h \ - volk_32f_s32f_convert_16i_a16.h \ + volk_32fc_deinterleave_32f_x2_a.h \ + volk_32fc_deinterleave_64f_x2_a.h \ + volk_32fc_s32f_deinterleave_real_16i_a.h \ + volk_32fc_deinterleave_real_32f_a.h \ + volk_32fc_deinterleave_real_64f_a.h \ + volk_32fc_x2_dot_prod_32fc_a.h \ + volk_32fc_index_max_16u_a.h \ + volk_32fc_s32f_magnitude_16i_a.h \ + volk_32fc_magnitude_32f_a.h \ + volk_32fc_x2_multiply_32fc_a.h \ + volk_32f_s32f_convert_16i_a.h \ volk_32f_s32f_convert_16i_u.h \ - volk_32f_s32f_convert_32i_a16.h \ + volk_32f_s32f_convert_32i_a.h \ volk_32f_s32f_convert_32i_u.h \ - volk_32f_convert_64f_a16.h \ + volk_32f_convert_64f_a.h \ volk_32f_convert_64f_u.h \ - volk_32f_s32f_convert_8i_a16.h \ + volk_32f_s32f_convert_8i_a.h \ volk_32f_s32f_convert_8i_u.h \ - volk_32fc_s32f_x2_power_spectral_density_32f_a16.h \ - volk_32fc_s32f_power_spectrum_32f_a16.h \ - volk_32fc_x2_square_dist_32f_a16.h \ - volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h \ - volk_32f_x2_divide_32f_a16.h \ - volk_32f_x2_dot_prod_32f_a16.h \ + volk_32fc_s32f_x2_power_spectral_density_32f_a.h \ + volk_32fc_s32f_power_spectrum_32f_a.h \ + volk_32fc_x2_square_dist_32f_a.h \ + volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h \ + volk_32f_x2_divide_32f_a.h \ + volk_32f_x2_dot_prod_32f_a.h \ volk_32f_x2_dot_prod_32f_u.h \ - volk_32f_s32f_32f_fm_detect_32f_a16.h \ - volk_32f_index_max_16u_a16.h \ - volk_32f_x2_s32f_interleave_16ic_a16.h \ - volk_32f_x2_interleave_32fc_a16.h \ - volk_32f_x2_max_32f_a16.h \ - volk_32f_x2_min_32f_a16.h \ - volk_32f_x2_multiply_32f_a16.h \ - volk_32f_s32f_normalize_a16.h \ - volk_32f_s32f_power_32f_a16.h \ - volk_32f_sqrt_32f_a16.h \ - volk_32f_s32f_stddev_32f_a16.h \ - volk_32f_stddev_and_mean_32f_x2_a16.h \ - volk_32f_x2_subtract_32f_a16.h \ - volk_32f_x3_sum_of_poly_32f_a16.h \ - volk_32i_x2_and_32i_a16.h \ - volk_32i_s32f_convert_32f_a16.h \ + volk_32f_s32f_32f_fm_detect_32f_a.h \ + volk_32f_index_max_16u_a.h \ + volk_32f_x2_s32f_interleave_16ic_a.h \ + volk_32f_x2_interleave_32fc_a.h \ + volk_32f_x2_max_32f_a.h \ + volk_32f_x2_min_32f_a.h \ + volk_32f_x2_multiply_32f_a.h \ + volk_32f_s32f_normalize_a.h \ + volk_32f_s32f_power_32f_a.h \ + volk_32f_sqrt_32f_a.h \ + volk_32f_s32f_stddev_32f_a.h \ + volk_32f_stddev_and_mean_32f_x2_a.h \ + volk_32f_x2_subtract_32f_a.h \ + volk_32f_x3_sum_of_poly_32f_a.h \ + volk_32i_x2_and_32i_a.h \ + volk_32i_s32f_convert_32f_a.h \ volk_32i_s32f_convert_32f_u.h \ - volk_32i_x2_or_32i_a16.h \ - volk_32u_byteswap_a16.h \ - volk_32u_popcnt_a16.h \ - volk_64f_convert_32f_a16.h \ + volk_32i_x2_or_32i_a.h \ + volk_32u_byteswap_a.h \ + volk_32u_popcnt_a.h \ + volk_64f_convert_32f_a.h \ volk_64f_convert_32f_u.h \ - volk_64f_x2_max_64f_a16.h \ - volk_64f_x2_min_64f_a16.h \ - volk_64u_byteswap_a16.h \ - volk_64u_popcnt_a16.h \ - volk_8ic_deinterleave_16i_x2_a16.h \ - volk_8ic_s32f_deinterleave_32f_x2_a16.h \ - volk_8ic_deinterleave_real_16i_a16.h \ - volk_8ic_s32f_deinterleave_real_32f_a16.h \ - volk_8ic_deinterleave_real_8i_a16.h \ - volk_8ic_x2_multiply_conjugate_16ic_a16.h \ - volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h \ - volk_8i_convert_16i_a16.h \ + volk_64f_x2_max_64f_a.h \ + volk_64f_x2_min_64f_a.h \ + volk_64u_byteswap_a.h \ + volk_64u_popcnt_a.h \ + volk_8ic_deinterleave_16i_x2_a.h \ + volk_8ic_s32f_deinterleave_32f_x2_a.h \ + volk_8ic_deinterleave_real_16i_a.h \ + volk_8ic_s32f_deinterleave_real_32f_a.h \ + volk_8ic_deinterleave_real_8i_a.h \ + volk_8ic_x2_multiply_conjugate_16ic_a.h \ + volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h \ + volk_8i_convert_16i_a.h \ volk_8i_convert_16i_u.h \ - volk_8i_s32f_convert_32f_a16.h \ + volk_8i_s32f_convert_32f_a.h \ volk_8i_s32f_convert_32f_u.h -- cgit From ccfac187cf86122be5760cb1e2a0bba3a58821c7 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 20 Oct 2011 10:46:35 -0700 Subject: volk: fixed signed/unsigned comparison warnings. --- volk/include/volk/volk_32f_index_max_16u_a.h | 2 +- volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h | 2 +- volk/include/volk/volk_32fc_index_max_16u_a.h | 2 +- volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h | 2 +- volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 2 +- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h | 2 +- volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h | 2 +- volk/include/volk/volk_32fc_x2_square_dist_32f_a.h | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/include/volk/volk_32f_index_max_16u_a.h index 3e0cf1d65..0c43a5081 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a.h +++ b/volk/include/volk/volk_32f_index_max_16u_a.h @@ -129,7 +129,7 @@ static inline void volk_32f_index_max_16u_a_generic(unsigned int* target, const float max = src0[0]; unsigned int index = 0; - int i = 1; + unsigned int i = 1; for(; i < num_points; ++i) { diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h index 2ea8fa96d..153bb3a25 100644 --- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h +++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h @@ -113,7 +113,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* s - int i = 0; + unsigned int i = 0; for(; i < num_bytes >> 2; ++i) { fst = src0[i]; diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/include/volk/volk_32fc_index_max_16u_a.h index 312e034e2..9566aa32e 100644 --- a/volk/include/volk/volk_32fc_index_max_16u_a.h +++ b/volk/include/volk/volk_32fc_index_max_16u_a.h @@ -194,7 +194,7 @@ static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32 float max = 0.0; unsigned int index = 0; - int i = 0; + unsigned int i = 0; for(; i < num_bytes >> 3; ++i) { diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h index a6c21336d..655075528 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h @@ -21,7 +21,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* res float sum0[2] = {0,0}; float sum1[2] = {0,0}; - int i = 0; + unsigned int i = 0; for(i = 0; i < n_2_ccomplex_blocks; ++i) { diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h index 6b22d9f81..f11c93682 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -20,7 +20,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* res float sum0[2] = {0,0}; float sum1[2] = {0,0}; - int i = 0; + unsigned int i = 0; for(i = 0; i < n_2_ccomplex_blocks; ++i) { diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h index 022a0a614..a865e0737 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h @@ -22,7 +22,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const float sum0[2] = {0,0}; float sum1[2] = {0,0}; - int i = 0; + unsigned int i = 0; for(i = 0; i < n_2_ccomplex_blocks; ++i) { diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h index be7a4ffe9..2d5f36b27 100644 --- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h +++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h @@ -109,7 +109,7 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; - int i = 0; + unsigned int i = 0; for(; i < num_bytes >> 3; ++i) { diff = src0[0] - points[i]; diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h index c21d00491..6a4a08ca5 100644 --- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h +++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h @@ -95,7 +95,7 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; - int i = 0; + unsigned int i = 0; for(; i < num_bytes >> 3; ++i) { diff = src0[0] - points[i]; -- cgit From ab6f8142da17ee70effd469f20a41821b4bc4513 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Thu, 20 Oct 2011 14:00:14 -0700 Subject: removes gcc warning, dont need symbol export on plain c structs --- volk/include/volk/volk_prefs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_prefs.h b/volk/include/volk/volk_prefs.h index 2a7f7e79f..83d9baf89 100644 --- a/volk/include/volk/volk_prefs.h +++ b/volk/include/volk/volk_prefs.h @@ -5,7 +5,7 @@ __VOLK_DECL_BEGIN -struct VOLK_API volk_arch_pref { +struct volk_arch_pref { char name[128]; char arch[32]; }; -- cgit From 52c51c983d51ff725238c22571b2d466875a5f22 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Mon, 14 Nov 2011 11:30:59 -0800 Subject: volk: conversion tweaks to build avx on MSVC --- volk/include/volk/volk_16i_max_star_horizontal_16i_a.h | 6 +++--- volk/include/volk/volk_32fc_index_max_16u_a.h | 4 ++-- volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 8 ++++---- volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h | 4 ++-- volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h | 4 ++-- volk/include/volk/volk_common.h | 2 ++ 6 files changed, 15 insertions(+), 13 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h index f60b33a41..a10a62350 100644 --- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h +++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H #define INCLUDED_volk_16i_max_star_horizontal_16i_a_H +#include #include #include @@ -21,7 +22,7 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in - volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; __m128i xmm5, xmm6, xmm7, xmm8; xmm4 = _mm_load_si128((__m128i*)shufmask0); @@ -92,8 +93,7 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in xmm0 = _mm_shuffle_epi8(xmm0, xmm3); - - _mm_storel_pd((double*)p_target, (__m128d)xmm0); + _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec); p_target = (__m128i*)((int8_t*)p_target + 8); diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/include/volk/volk_32fc_index_max_16u_a.h index 9566aa32e..125a34582 100644 --- a/volk/include/volk/volk_32fc_index_max_16u_a.h +++ b/volk/include/volk/volk_32fc_index_max_16u_a.h @@ -87,8 +87,8 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_ xmm2 = _mm_load_ps((float*)src0); - xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8); - xmm8 = (__m128i)xmm1; + xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); + xmm8 = bit128_p(&xmm1)->int_vec; xmm2 = _mm_mul_ps(xmm2, xmm2); diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h index f11c93682..02faf86c2 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -96,9 +96,9 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result in1 = _mm_loadu_ps( (float*) (input+offset) ); in2 = _mm_loadu_ps( (float*) (taps+offset) ); - Rv = in1*in2; + Rv = _mm_mul_ps(in1, in2); fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); - Iv = in1*fehg; + Iv = _mm_mul_ps(in1, fehg); Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); Ivm = _mm_xor_ps( negMask.vec, Iv ); Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); @@ -119,9 +119,9 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result in1 = _mm_loadu_ps( (float*) (input+offset) ); in2 = _mm_loadu_ps( (float*) (taps+offset) ); - Rv = _mm_and_ps(in1*in2, halfMask.vec); + Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec); fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); - Iv = _mm_and_ps(in1*fehg, halfMask.vec); + Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec); Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); Ivm = _mm_xor_ps( negMask.vec, Iv ); Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h index 0bb76f1d1..0c280eb6e 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h @@ -26,8 +26,8 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect for(;number < quarterPoints; number++){ // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); + y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); // Calculate the ar*cr - ai*(-ci) portions realz = _mm_madd_epi16(x,y); diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h index 3e05608a4..a2c2b04f6 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h @@ -29,8 +29,8 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* for(;number < quarterPoints; number++){ // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); + y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); // Calculate the ar*cr - ai*(-ci) portions realz = _mm_madd_epi16(x,y); diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index 2c935d1fb..38263d5f7 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -91,4 +91,6 @@ union bit128{ #endif }; +#define bit128_p(x) ((union bit128 *)(x)) + #endif /*INCLUDED_LIBVOLK_COMMON_H*/ -- cgit From 488f0c614f0e951314c686e7f168bc62cea250d5 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 9 Nov 2011 11:32:47 -0800 Subject: Volk: added 32fc x scalar multiply, implemented in Orc & generic. Orc/SSE tested 10x faster than generic. --- .../include/volk/volk_32fc_s32fc_multiply_32fc_a.h | 46 ++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h new file mode 100644 index 000000000..b27a7259f --- /dev/null +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h @@ -0,0 +1,46 @@ +#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H +#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_s32fc_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points); +static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + + + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */ -- cgit From 7695ca06f1b565723f3fc302d7b7818e93d86a57 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 9 Nov 2011 12:46:00 -0800 Subject: Volk: 32f_s32f_multiply_32f --- volk/include/volk/volk_32f_s32f_multiply_32f_a.h | 44 ++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 volk/include/volk/volk_32f_s32f_multiply_32f_a.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h new file mode 100644 index 000000000..6aef1735c --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h @@ -0,0 +1,44 @@ +#ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H +#define INCLUDED_volk_32f_s32f_multiply_32f_a_H + +#include +#include + +#ifdef LV_HAVE_GENERIC +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = aVector; + float* outputPtr = cVector; + for(number = 0; number < num_points; number++){ + *outputPtr = (*inputPtr) * scalar; + inputPtr++; + outputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); +static inline void volk_32f_s32f_multiply_32f_a_orc(float* cVector, float* aVector, const float scalar, unsigned int num_points){ + volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points); +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */ -- cgit From a6b5389139341784edf30d512aa42360d3a8cf02 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Thu, 1 Dec 2011 09:21:11 -0500 Subject: volk: squashed changes 32f_s32f_multiply_32f_a 32fc_x2_dot_prod_32fc_u --- volk/include/volk/volk_32f_s32f_multiply_32f_a.h | 8 +- volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h | 116 +++++++++++++++++++++++ 2 files changed, 120 insertions(+), 4 deletions(-) create mode 100644 volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h index 6aef1735c..37223dc81 100644 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h @@ -12,9 +12,9 @@ \param scalar the scalar value \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, float* aVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ unsigned int number = 0; - float* inputPtr = aVector; + const float* inputPtr = aVector; float* outputPtr = cVector; for(number = 0; number < num_points; number++){ *outputPtr = (*inputPtr) * scalar; @@ -32,8 +32,8 @@ static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, float* a \param scalar the scalar value \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); -static inline void volk_32f_s32f_multiply_32f_a_orc(float* cVector, float* aVector, const float scalar, unsigned int num_points){ +extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, const float scalar, unsigned int num_points); +static inline void volk_32f_s32f_multiply_32f_a_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points); } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h new file mode 100644 index 000000000..7c0dba7fd --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h @@ -0,0 +1,116 @@ +#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H +#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H + +#include +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_points/2; + unsigned int isodd = num_points &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + unsigned int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[num_points - 1] * taps[num_points - 1]; + + } + +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_SSE3 + +#include + +static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_points/2; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + + _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if(num_points % 1 != 0) { + dotProduct += (*a) * (*b); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/ -- cgit From 8cc3b8984c1a71ccbb8a33534571c26ee7447587 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Mon, 5 Dec 2011 11:00:34 -0500 Subject: volk: updated autotools Makefile to build with new headers. --- volk/include/volk/Makefile.am | 3 +++ 1 file changed, 3 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index b7da9b37c..e7333a015 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -53,7 +53,9 @@ volkinclude_HEADERS = \ volk_16u_byteswap_a.h \ volk_32f_accumulator_s32f_a.h \ volk_32f_x2_add_32f_a.h \ + volk_32f_s32f_multiply_32f_a.h \ volk_32fc_32f_multiply_32fc_a.h \ + volk_32fc_s32fc_multiply_32fc_a.h \ volk_32fc_s32f_power_32fc_a.h \ volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \ volk_32fc_s32f_atan2_32f_a.h \ @@ -65,6 +67,7 @@ volkinclude_HEADERS = \ volk_32fc_deinterleave_real_32f_a.h \ volk_32fc_deinterleave_real_64f_a.h \ volk_32fc_x2_dot_prod_32fc_a.h \ + volk_32fc_x2_dot_prod_32fc_u.h \ volk_32fc_index_max_16u_a.h \ volk_32fc_s32f_magnitude_16i_a.h \ volk_32fc_magnitude_32f_a.h \ -- cgit From 00420d32081d8252bb37142b2be19a8a7c4dc4c4 Mon Sep 17 00:00:00 2001 From: Johnathan Corgan Date: Thu, 8 Dec 2011 13:48:48 -0800 Subject: Removed autotools, gr-waveform, some cleanup Nick Foster owes Nick Corgan a six-pack of beer! --- volk/include/.gitignore | 10 ---- volk/include/Makefile.am | 23 -------- volk/include/volk/.gitignore | 2 - volk/include/volk/Makefile.am | 126 ------------------------------------------ 4 files changed, 161 deletions(-) delete mode 100644 volk/include/.gitignore delete mode 100644 volk/include/Makefile.am delete mode 100644 volk/include/volk/.gitignore delete mode 100644 volk/include/volk/Makefile.am (limited to 'volk/include') diff --git a/volk/include/.gitignore b/volk/include/.gitignore deleted file mode 100644 index 378f771f5..000000000 --- a/volk/include/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -/*.cache -/*.la -/*.lo -/*.pc -/.deps -/.la -/.libs -/.lo -/Makefile -/Makefile.in diff --git a/volk/include/Makefile.am b/volk/include/Makefile.am deleted file mode 100644 index 375d1a7d5..000000000 --- a/volk/include/Makefile.am +++ /dev/null @@ -1,23 +0,0 @@ -# -# Copyright 2008 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -# - -include $(top_srcdir)/Makefile.common - -SUBDIRS = volk diff --git a/volk/include/volk/.gitignore b/volk/include/volk/.gitignore deleted file mode 100644 index b336cc7ce..000000000 --- a/volk/include/volk/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/Makefile -/Makefile.in diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am deleted file mode 100644 index e7333a015..000000000 --- a/volk/include/volk/Makefile.am +++ /dev/null @@ -1,126 +0,0 @@ -# -# Copyright 2010,2011 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -# - -include $(top_srcdir)/Makefile.common - -AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ - $(LV_CXXFLAGS) $(WITH_INCLUDES) - -volkincludedir = $(prefix)/include/volk - -volkinclude_HEADERS = \ - volk_complex.h \ - volk_common.h \ - volk_prefs.h \ - $(top_gendir)/include/volk/volk_config_fixed.h \ - $(top_gendir)/include/volk/volk_typedefs.h \ - $(top_gendir)/include/volk/volk.h \ - $(top_gendir)/include/volk/volk_cpu.h \ - volk_16i_x5_add_quad_16i_x4_a.h \ - volk_16i_branch_4_state_8_a.h \ - volk_16ic_deinterleave_16i_x2_a.h \ - volk_16ic_s32f_deinterleave_32f_x2_a.h \ - volk_16ic_deinterleave_real_16i_a.h \ - volk_16ic_s32f_deinterleave_real_32f_a.h \ - volk_16ic_deinterleave_real_8i_a.h \ - volk_16ic_magnitude_16i_a.h \ - volk_16ic_s32f_magnitude_32f_a.h \ - volk_16i_s32f_convert_32f_a.h \ - volk_16i_s32f_convert_32f_u.h \ - volk_16i_convert_8i_a.h \ - volk_16i_convert_8i_u.h \ - volk_16i_max_star_16i_a.h \ - volk_16i_max_star_horizontal_16i_a.h \ - volk_16i_permute_and_scalar_add_a.h \ - volk_16i_x4_quad_max_star_16i_a.h \ - volk_16u_byteswap_a.h \ - volk_32f_accumulator_s32f_a.h \ - volk_32f_x2_add_32f_a.h \ - volk_32f_s32f_multiply_32f_a.h \ - volk_32fc_32f_multiply_32fc_a.h \ - volk_32fc_s32fc_multiply_32fc_a.h \ - volk_32fc_s32f_power_32fc_a.h \ - volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \ - volk_32fc_s32f_atan2_32f_a.h \ - volk_32fc_x2_conjugate_dot_prod_32fc_a.h \ - volk_32fc_x2_conjugate_dot_prod_32fc_u.h \ - volk_32fc_deinterleave_32f_x2_a.h \ - volk_32fc_deinterleave_64f_x2_a.h \ - volk_32fc_s32f_deinterleave_real_16i_a.h \ - volk_32fc_deinterleave_real_32f_a.h \ - volk_32fc_deinterleave_real_64f_a.h \ - volk_32fc_x2_dot_prod_32fc_a.h \ - volk_32fc_x2_dot_prod_32fc_u.h \ - volk_32fc_index_max_16u_a.h \ - volk_32fc_s32f_magnitude_16i_a.h \ - volk_32fc_magnitude_32f_a.h \ - volk_32fc_x2_multiply_32fc_a.h \ - volk_32f_s32f_convert_16i_a.h \ - volk_32f_s32f_convert_16i_u.h \ - volk_32f_s32f_convert_32i_a.h \ - volk_32f_s32f_convert_32i_u.h \ - volk_32f_convert_64f_a.h \ - volk_32f_convert_64f_u.h \ - volk_32f_s32f_convert_8i_a.h \ - volk_32f_s32f_convert_8i_u.h \ - volk_32fc_s32f_x2_power_spectral_density_32f_a.h \ - volk_32fc_s32f_power_spectrum_32f_a.h \ - volk_32fc_x2_square_dist_32f_a.h \ - volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h \ - volk_32f_x2_divide_32f_a.h \ - volk_32f_x2_dot_prod_32f_a.h \ - volk_32f_x2_dot_prod_32f_u.h \ - volk_32f_s32f_32f_fm_detect_32f_a.h \ - volk_32f_index_max_16u_a.h \ - volk_32f_x2_s32f_interleave_16ic_a.h \ - volk_32f_x2_interleave_32fc_a.h \ - volk_32f_x2_max_32f_a.h \ - volk_32f_x2_min_32f_a.h \ - volk_32f_x2_multiply_32f_a.h \ - volk_32f_s32f_normalize_a.h \ - volk_32f_s32f_power_32f_a.h \ - volk_32f_sqrt_32f_a.h \ - volk_32f_s32f_stddev_32f_a.h \ - volk_32f_stddev_and_mean_32f_x2_a.h \ - volk_32f_x2_subtract_32f_a.h \ - volk_32f_x3_sum_of_poly_32f_a.h \ - volk_32i_x2_and_32i_a.h \ - volk_32i_s32f_convert_32f_a.h \ - volk_32i_s32f_convert_32f_u.h \ - volk_32i_x2_or_32i_a.h \ - volk_32u_byteswap_a.h \ - volk_32u_popcnt_a.h \ - volk_64f_convert_32f_a.h \ - volk_64f_convert_32f_u.h \ - volk_64f_x2_max_64f_a.h \ - volk_64f_x2_min_64f_a.h \ - volk_64u_byteswap_a.h \ - volk_64u_popcnt_a.h \ - volk_8ic_deinterleave_16i_x2_a.h \ - volk_8ic_s32f_deinterleave_32f_x2_a.h \ - volk_8ic_deinterleave_real_16i_a.h \ - volk_8ic_s32f_deinterleave_real_32f_a.h \ - volk_8ic_deinterleave_real_8i_a.h \ - volk_8ic_x2_multiply_conjugate_16ic_a.h \ - volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h \ - volk_8i_convert_16i_a.h \ - volk_8i_convert_16i_u.h \ - volk_8i_s32f_convert_32f_a.h \ - volk_8i_s32f_convert_32f_u.h -- cgit From fe1e9b77d7f4b48af3ab309321dc79c74e79a290 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sun, 11 Dec 2011 13:13:39 -0500 Subject: volk: fixed some of the simpler warnings. --- volk/include/volk/volk_32f_s32f_power_32f_a.h | 4 ++-- volk/include/volk/volk_32fc_s32f_atan2_32f_a.h | 4 ++-- volk/include/volk/volk_32fc_s32f_power_32fc_a.h | 2 +- volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 3 ++- 4 files changed, 7 insertions(+), 6 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/include/volk/volk_32f_s32f_power_32f_a.h index c4fa31bd1..09c905961 100644 --- a/volk/include/volk/volk_32f_s32f_power_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_power_32f_a.h @@ -21,12 +21,12 @@ */ static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; float* cPtr = cVector; const float* aPtr = aVector; #ifdef LV_HAVE_LIB_SIMDMATH + const unsigned int quarterPoints = num_points / 4; __m128 vPower = _mm_set_ps1(power); __m128 zeroValue = _mm_setzero_ps(); __m128 signMask; @@ -78,12 +78,12 @@ static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* */ static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; float* cPtr = cVector; const float* aPtr = aVector; #ifdef LV_HAVE_LIB_SIMDMATH + const unsigned int quarterPoints = num_points / 4; __m128 vPower = _mm_set_ps1(power); __m128 zeroValue = _mm_setzero_ps(); __m128 signMask; diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h index 9304b0c28..7bd001aa0 100644 --- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h @@ -24,10 +24,10 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const float* outPtr = outputVector; unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; const float invNormalizeFactor = 1.0 / normalizeFactor; #ifdef LV_HAVE_LIB_SIMDMATH + const unsigned int quarterPoints = num_points / 4; __m128 testVector = _mm_set_ps1(2*M_PI); __m128 correctVector = _mm_set_ps1(M_PI); __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); @@ -86,10 +86,10 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv float* outPtr = outputVector; unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; const float invNormalizeFactor = 1.0 / normalizeFactor; #ifdef LV_HAVE_LIB_SIMDMATH + const unsigned int quarterPoints = num_points / 4; __m128 testVector = _mm_set_ps1(2*M_PI); __m128 correctVector = _mm_set_ps1(M_PI); __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h index ec1d7167f..588b532b4 100644 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h @@ -28,12 +28,12 @@ static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, con */ static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; #ifdef LV_HAVE_LIB_SIMDMATH + const unsigned int quarterPoints = num_points / 4; __m128 vPower = _mm_set_ps1(power); __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h index 02faf86c2..3ae7208a8 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -66,7 +66,8 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* res static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + // Variable never used? + //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; union HalfMask { uint32_t intRep[4]; -- cgit From 1561b416702a26d53f0bbd5879925534c0ff82b2 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 6 Jan 2012 14:00:03 -0500 Subject: volk: minor changes to fix some warnings. --- volk/include/volk/volk_16i_max_star_16i_a.h | 4 ++-- volk/include/volk/volk_32f_s32f_stddev_32f_a.h | 6 +++--- volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/include/volk/volk_16i_max_star_16i_a.h index 6a4f63708..28197ddef 100644 --- a/volk/include/volk/volk_16i_max_star_16i_a.h +++ b/volk/include/volk/volk_16i_max_star_16i_a.h @@ -18,7 +18,7 @@ static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, un short candidate = src0[0]; short cands[8]; - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6; __m128i *p_src0; @@ -41,7 +41,7 @@ static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, un for(i = 0; i < bound; ++i) { xmm1 = _mm_load_si128(p_src0); p_src0 += 1; - xmm2 = _mm_sub_epi16(xmm1, xmm0); + //xmm2 = _mm_sub_epi16(xmm1, xmm0); diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h index 881067bdc..75fe0cb2e 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h @@ -60,7 +60,7 @@ static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* } returnValue /= num_points; returnValue -= (mean * mean); - returnValue = sqrt(returnValue); + returnValue = sqrtf(returnValue); } *stddev = returnValue; } @@ -106,7 +106,7 @@ static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* in } returnValue /= num_points; returnValue -= (mean * mean); - returnValue = sqrt(returnValue); + returnValue = sqrtf(returnValue); } *stddev = returnValue; } @@ -133,7 +133,7 @@ static inline void volk_32f_s32f_stddev_32f_a_generic(float* stddev, const float returnValue /= num_points; returnValue -= (mean * mean); - returnValue = sqrt(returnValue); + returnValue = sqrtf(returnValue); } *stddev = returnValue; } diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h index 3a82e3d2f..20ff676d8 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h @@ -72,7 +72,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float newMean /= num_points; returnValue /= num_points; returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); + returnValue = sqrtf(returnValue); } *stddev = returnValue; *mean = newMean; @@ -128,7 +128,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* m newMean /= num_points; returnValue /= num_points; returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); + returnValue = sqrtf(returnValue); } *stddev = returnValue; *mean = newMean; @@ -157,7 +157,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_generic(float* stddev, floa newMean /= num_points; returnValue /= num_points; returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); + returnValue = sqrtf(returnValue); } *stddev = returnValue; *mean = newMean; -- cgit From d825bb2bc1c7bc4105723ae5f699f283bc4f3c44 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 26 Jan 2012 17:53:49 -0500 Subject: volk: float_to_short now clips the values instead of wrapping around. --- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 52 +++++++++++++++++++++---- volk/include/volk/volk_32f_s32f_convert_16i_u.h | 52 +++++++++++++++++++++---- 2 files changed, 90 insertions(+), 14 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index 0a2b4f0f2..10c921b08 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -21,17 +21,29 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2; __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < eighthPoints; number++){ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); @@ -40,7 +52,12 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const number = eighthPoints * 8; for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -61,8 +78,15 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -70,7 +94,8 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); @@ -81,7 +106,12 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -98,9 +128,17 @@ static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, co int16_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r < min_val) + r = min_val; + else if(r > max_val) + r = max_val; + *outputVectorPtr++ = (int16_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h index dec3f1611..f339a7d10 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -21,17 +21,29 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2; __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < eighthPoints; number++){ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); @@ -40,7 +52,12 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const number = eighthPoints * 8; for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -62,8 +79,15 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -71,7 +95,8 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); @@ -82,7 +107,12 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -100,9 +130,17 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co int16_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int16_t)(r); } } #endif /* LV_HAVE_GENERIC */ -- cgit From d8b02979cef097971bc0656b904f7b51d19b03c9 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 26 Jan 2012 20:07:26 -0500 Subject: volk: fix a warning. --- volk/include/volk/volk_64u_popcnt_a.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h index bdaa98643..4683f1e38 100644 --- a/volk/include/volk/volk_64u_popcnt_a.h +++ b/volk/include/volk/volk_64u_popcnt_a.h @@ -10,10 +10,11 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) { - const uint32_t* valueVector = (const uint32_t*)&value; + //const uint32_t* valueVector = (const uint32_t*)&value; // This is faster than a lookup table - uint32_t retVal = valueVector[0]; + //uint32_t retVal = valueVector[0]; + uint32_t retVal = (uint32_t)(value && 0x00000000FFFFFFFF); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); @@ -22,7 +23,8 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value retVal = (retVal + (retVal >> 16)) & 0x0000003F; uint64_t retVal64 = retVal; - retVal = valueVector[1]; + //retVal = valueVector[1]; + retVal = (uint32_t)((value && 0xFFFFFFFF00000000) >> 31); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; -- cgit From 42d9560a50bbbab143b48bda5a73a5379818ddbe Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 26 Jan 2012 20:07:51 -0500 Subject: volk: float_to_int and float_to_char updated to clip instead of wrap around. The float to int clips at smaller than 2^32 because of the limits of the float representation. --- volk/include/volk/volk_32f_s32f_convert_32i_a.h | 60 ++++++++++++++++++++++--- volk/include/volk/volk_32f_s32f_convert_32i_u.h | 45 ++++++++++++++++--- volk/include/volk/volk_32f_s32f_convert_8i_a.h | 53 ++++++++++++++++++---- volk/include/volk/volk_32f_s32f_convert_8i_u.h | 53 ++++++++++++++++++---- 4 files changed, 183 insertions(+), 28 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h index aa370e614..15fa282fb 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -21,14 +21,22 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + __m256 vScalar = _mm256_set1_ps(scalar); __m256 inputVal1; __m256i intInputVal1; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); for(;number < eighthPoints; number++){ inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - intInputVal1 = _mm256_cvtps_epi32(_mm256_mul_ps(inputVal1, vScalar)); + inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm256_cvtps_epi32(inputVal1); _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); outputVectorPtr += 8; @@ -36,7 +44,12 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const number = eighthPoints * 8; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_AVX */ @@ -57,14 +70,22 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1; __m128i intInputVal1; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < quarterPoints; number++){ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm_cvtps_epi32(inputVal1); _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); outputVectorPtr += 4; @@ -72,7 +93,12 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -93,8 +119,15 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -102,7 +135,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); @@ -113,7 +146,12 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -130,9 +168,17 @@ static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, co int32_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -2147483647; + float max_val = 2147483647; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int32_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h index b4e954dc4..d8493454b 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -21,14 +21,24 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + //float min_val = -2147483647; + //float max_val = 2147483647; + float min_val = -2146400000; + float max_val = 2146400000; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1; __m128i intInputVal1; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < quarterPoints; number++){ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm_cvtps_epi32(inputVal1); _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); outputVectorPtr += 4; @@ -36,7 +46,12 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -58,8 +73,15 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -67,7 +89,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); @@ -78,7 +100,12 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -96,9 +123,17 @@ static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, co int32_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -2147483647; + float max_val = 2147483647; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int32_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h index 8d87a07d7..05172171c 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h @@ -21,9 +21,16 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2, inputVal3, inputVal4; __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < sixteenthPoints; number++){ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; @@ -31,10 +38,15 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(inputVal1); + intInputVal2 = _mm_cvtps_epi32(inputVal2); + intInputVal3 = _mm_cvtps_epi32(inputVal3); + intInputVal4 = _mm_cvtps_epi32(inputVal4); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -47,7 +59,12 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f number = sixteenthPoints * 16; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int8_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -67,9 +84,16 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl const unsigned int quarterPoints = num_points / 4; const float* inputVectorPtr = (const float*)inputVector; + + float min_val = -128; + float max_val = 127; + float r; + int8_t* outputVectorPtr = outputVector; __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -77,7 +101,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); @@ -88,7 +112,12 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int8_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -105,9 +134,17 @@ static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, cons int8_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -128; + float max_val = 127; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = (int8_t)(*inputVectorPtr++ * scalar); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int8_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h index 1c6bf87c9..12991e9c1 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h @@ -21,9 +21,16 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2, inputVal3, inputVal4; __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < sixteenthPoints; number++){ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; @@ -31,10 +38,15 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(inputVal1); + intInputVal2 = _mm_cvtps_epi32(inputVal2); + intInputVal3 = _mm_cvtps_epi32(inputVal3); + intInputVal4 = _mm_cvtps_epi32(inputVal4); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -47,7 +59,12 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f number = sixteenthPoints * 16; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -69,8 +86,15 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -78,7 +102,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); @@ -89,7 +113,12 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -107,9 +136,17 @@ static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, cons int8_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -128; + float max_val = 127; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int16_t)(r); } } #endif /* LV_HAVE_GENERIC */ -- cgit From d870487437b33d1a26eb8f5f5aaa414ca6ed24e0 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 28 Jan 2012 19:31:33 -0500 Subject: volk: fix lower bound of int conversion. --- volk/include/volk/volk_32f_s32f_convert_32i_a.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h index 15fa282fb..8f2fc791e 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -22,7 +22,7 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; - float min_val = -2147483647; + float min_val = -2147483648; float max_val = 2147483647; float r; @@ -71,7 +71,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; - float min_val = -2147483647; + float min_val = -2147483648; float max_val = 2147483647; float r; -- cgit From 9c9b9a8aa05c7cde7cd11cd5d1e052630715d252 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sun, 29 Jan 2012 17:31:37 -0500 Subject: volk: added unaligned volk function for magnitude of a complex number. --- volk/include/volk/volk_32fc_magnitude_32f_u.h | 118 ++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 volk/include/volk/volk_32fc_magnitude_32f_u.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h new file mode 100644 index 000000000..ed1cedef9 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_32f_u.h @@ -0,0 +1,118 @@ +#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H +#define INCLUDED_volk_32fc_magnitude_32f_u_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ -- cgit From d142fd9e31715bae85d65c4790f278143a784142 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sun, 29 Jan 2012 17:32:30 -0500 Subject: volk: added volk magnitiude squared functions (aligned/unaligned) for complex numbers. --- volk/include/volk/Makefile.am | 3 + .../volk/volk_32fc_magnitude_squared_32f_a.h | 114 +++++++++++++++++++++ .../volk/volk_32fc_magnitude_squared_32f_u.h | 114 +++++++++++++++++++++ 3 files changed, 231 insertions(+) create mode 100644 volk/include/volk/volk_32fc_magnitude_squared_32f_a.h create mode 100644 volk/include/volk/volk_32fc_magnitude_squared_32f_u.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index e7333a015..c2502f6e6 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -71,6 +71,9 @@ volkinclude_HEADERS = \ volk_32fc_index_max_16u_a.h \ volk_32fc_s32f_magnitude_16i_a.h \ volk_32fc_magnitude_32f_a.h \ + volk_32fc_magnitude_32f_u.h \ + volk_32fc_magnitude_squared_32f_a.h \ + volk_32fc_magnitude_squared_32f_u.h \ volk_32fc_x2_multiply_32fc_a.h \ volk_32f_s32f_convert_16i_a.h \ volk_32f_s32f_convert_16i_u.h \ diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h new file mode 100644 index 000000000..00bdefbb5 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h @@ -0,0 +1,114 @@ +#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H +#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h new file mode 100644 index 000000000..6eb4a523a --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h @@ -0,0 +1,114 @@ +#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H +#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ -- cgit From cb458204245632ac7a571bfe96b90d3c55a2f01a Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Mon, 30 Jan 2012 00:22:44 -0500 Subject: volk: adding complex to imag kernel. --- volk/include/volk/Makefile.am | 1 + .../volk/volk_32fc_deinterleave_imag_32f_a.h | 68 ++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index c2502f6e6..c5b99c41b 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -65,6 +65,7 @@ volkinclude_HEADERS = \ volk_32fc_deinterleave_64f_x2_a.h \ volk_32fc_s32f_deinterleave_real_16i_a.h \ volk_32fc_deinterleave_real_32f_a.h \ + volk_32fc_deinterleave_imag_32f_a.h \ volk_32fc_deinterleave_real_64f_a.h \ volk_32fc_x2_dot_prod_32fc_a.h \ volk_32fc_x2_dot_prod_32fc_u.h \ diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h new file mode 100644 index 000000000..adc4112b9 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_imag_32f_a_H +#define INCLUDED_volk_32fc_deinterleave_imag_32f_a_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into Q vector data + \param complexVector The complex input vector + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + float* qBufferPtr = qBuffer; + + __m128 cplxValue1, cplxValue2, iValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in q1q2q3q4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(qBufferPtr, iValue); + + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into Q vector data + \param complexVector The complex input vector + \param qBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_imag_32f_a_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + float* qBufferPtr = qBuffer; + for(number = 0; number < num_points; number++){ + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_a_H */ -- cgit From 070a6c9ce93c2f2043a0145e253cc55f801f9433 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 2 Feb 2012 14:25:53 -0500 Subject: volk: adding unaligned versions of complex multiply a constant and complex multiply 2 streams. --- .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 42 ++++++++++++ volk/include/volk/volk_32fc_x2_multiply_32fc_u.h | 77 ++++++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h create mode 100644 volk/include/volk/volk_32fc_x2_multiply_32fc_u.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h new file mode 100644 index 000000000..201dcf5f6 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h @@ -0,0 +1,42 @@ +#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H +#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h new file mode 100644 index 000000000..729c1a4ad --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H +#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ -- cgit From 67d23bdecd3f15197bdf46f8c0cd66a6f754fea5 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 2 Feb 2012 16:35:14 -0500 Subject: volk: improving performance of multiply_const and multiply two streams. --- .../include/volk/volk_32fc_s32fc_multiply_32fc_a.h | 63 +++++++++++++++++++++- .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 45 ++++++++++++++++ volk/include/volk/volk_32fc_x2_multiply_32fc_a.h | 1 - 3 files changed, 106 insertions(+), 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h index b27a7259f..205461afb 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h @@ -6,6 +6,52 @@ #include #include +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(scalar)); + yh = _mm_set_ps1(lv_cimag(scalar)); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + + #ifdef LV_HAVE_GENERIC /*! \brief Multiplies the two input complex vectors and stores their results in the third vector @@ -17,11 +63,24 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; + unsigned int number = num_points; - for(number = 0; number < num_points; number++){ + // unwrap loop + while (number >= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h index 201dcf5f6..a9dfcda19 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h @@ -6,6 +6,51 @@ #include #include +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(scalar)); + yh = _mm_set_ps1(lv_cimag(scalar)); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + #ifdef LV_HAVE_GENERIC /*! \brief Multiplies the two input complex vectors and stores their results in the third vector diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h index 18dd092e8..aec8bd716 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h @@ -23,7 +23,6 @@ static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const l lv_32fc_t* c = cVector; const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; - for(;number < halfPoints; number++){ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi -- cgit From b3e538493f969526c704a3eb1b3c8d8c61be78f6 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 2 Feb 2012 16:38:36 -0500 Subject: volk: adding new functions to Makefile. --- volk/include/volk/Makefile.am | 2 ++ 1 file changed, 2 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index c5b99c41b..eea006bf2 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -56,6 +56,7 @@ volkinclude_HEADERS = \ volk_32f_s32f_multiply_32f_a.h \ volk_32fc_32f_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_a.h \ + volk_32fc_s32fc_multiply_32fc_u.h \ volk_32fc_s32f_power_32fc_a.h \ volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \ volk_32fc_s32f_atan2_32f_a.h \ @@ -76,6 +77,7 @@ volkinclude_HEADERS = \ volk_32fc_magnitude_squared_32f_a.h \ volk_32fc_magnitude_squared_32f_u.h \ volk_32fc_x2_multiply_32fc_a.h \ + volk_32fc_x2_multiply_32fc_u.h \ volk_32f_s32f_convert_16i_a.h \ volk_32f_s32f_convert_16i_u.h \ volk_32f_s32f_convert_32i_a.h \ -- cgit From ae663decab658be25ac01072fa2f5c8454bd6167 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 2 Feb 2012 17:26:39 -0500 Subject: core: moving multiply_const_ff from gengen to general to take advantage of volk. Also adds SSE and AVX and unaligned Volk versions for this. --- volk/include/volk/volk_32f_s32f_multiply_32f_a.h | 75 ++++++++++++++++++++++ .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 12 ++-- 2 files changed, 81 insertions(+), 6 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h index 37223dc81..d1c6f3f65 100644 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h @@ -4,6 +4,81 @@ #include #include +#ifdef LV_HAVE_SSE +#include +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, bVal, cVal; + bVal = _mm_set_ps1(scalar); + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, bVal, cVal; + bVal = _mm256_set1_ps(scalar); + for(;number < eighthPoints; number++){ + + aVal = _mm256_load_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_GENERIC /*! \brief Scalar float multiply diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h index a9dfcda19..450a89066 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h @@ -9,10 +9,10 @@ #ifdef LV_HAVE_SSE3 #include /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector + \brief Multiplies the input vector by a scalar and stores the results in the third vector \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ @@ -53,10 +53,10 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons #ifdef LV_HAVE_GENERIC /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector + \brief Multiplies the input vector by a scalar and stores the results in the third vector \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ -- cgit From f028a198cb47e0486ad41a29bd3b3dcf0663d766 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 4 Feb 2012 11:04:30 -0500 Subject: volk: new unaligned versions of float multipliers. --- volk/include/volk/Makefile.am | 2 + volk/include/volk/volk_32f_s32f_multiply_32f_u.h | 102 ++++++++++++++++++++ volk/include/volk/volk_32f_x2_multiply_32f_u.h | 106 +++++++++++++++++++++ .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 24 ++--- 4 files changed, 222 insertions(+), 12 deletions(-) create mode 100644 volk/include/volk/volk_32f_s32f_multiply_32f_u.h create mode 100644 volk/include/volk/volk_32f_x2_multiply_32f_u.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index eea006bf2..312ff2d5a 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -54,6 +54,7 @@ volkinclude_HEADERS = \ volk_32f_accumulator_s32f_a.h \ volk_32f_x2_add_32f_a.h \ volk_32f_s32f_multiply_32f_a.h \ + volk_32f_s32f_multiply_32f_u.h \ volk_32fc_32f_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_u.h \ @@ -100,6 +101,7 @@ volkinclude_HEADERS = \ volk_32f_x2_max_32f_a.h \ volk_32f_x2_min_32f_a.h \ volk_32f_x2_multiply_32f_a.h \ + volk_32f_x2_multiply_32f_u.h \ volk_32f_s32f_normalize_a.h \ volk_32f_s32f_power_32f_a.h \ volk_32f_sqrt_32f_a.h \ diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h new file mode 100644 index 000000000..0e700060f --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h @@ -0,0 +1,102 @@ +#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H +#define INCLUDED_volk_32f_s32f_multiply_32f_u_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, bVal, cVal; + bVal = _mm_set_ps1(scalar); + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, bVal, cVal; + bVal = _mm256_set1_ps(scalar); + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* inputPtr = aVector; + float* outputPtr = cVector; + for(number = 0; number < num_points; number++){ + *outputPtr = (*inputPtr) * scalar; + inputPtr++; + outputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h new file mode 100644 index 000000000..6c3ce5d83 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_multiply_32f_u.h @@ -0,0 +1,106 @@ +#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H +#define INCLUDED_volk_32f_x2_multiply_32f_u_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +/*! + \brief Multiplies the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h index 450a89066..218c450f8 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h @@ -8,13 +8,13 @@ #ifdef LV_HAVE_SSE3 #include - /*! +/*! \brief Multiplies the input vector by a scalar and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be multiplied - \param scalar The complex scalar to multiply aVector - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int halfPoints = num_points / 2; @@ -52,13 +52,13 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC - /*! +/*! \brief Multiplies the input vector by a scalar and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be multiplied - \param scalar The complex scalar to multiply aVector - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; -- cgit From 298623615b249de459cd12b5507dab921fc3210b Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Mon, 6 Feb 2012 10:31:28 -0500 Subject: volk: added unaligned version of adding 2 vectors. --- volk/include/volk/Makefile.am | 1 + volk/include/volk/volk_32f_x2_add_32f_u.h | 66 +++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 volk/include/volk/volk_32f_x2_add_32f_u.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 312ff2d5a..20864efbe 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -53,6 +53,7 @@ volkinclude_HEADERS = \ volk_16u_byteswap_a.h \ volk_32f_accumulator_s32f_a.h \ volk_32f_x2_add_32f_a.h \ + volk_32f_x2_add_32f_u.h \ volk_32f_s32f_multiply_32f_a.h \ volk_32f_s32f_multiply_32f_u.h \ volk_32fc_32f_multiply_32fc_a.h \ diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h new file mode 100644 index 000000000..e360a7958 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_add_32f_u.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_32f_x2_add_32f_u_H +#define INCLUDED_volk_32f_x2_add_32f_u_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */ -- cgit From 69210086c7ae98b93a63a1d810ee28b304a13520 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Mon, 6 Feb 2012 22:02:09 -0500 Subject: volk: added a 32fc multiply conjugate kernel. --- volk/include/volk/Makefile.am | 2 + .../volk/volk_32fc_x2_multiply_conjugate_32fc_a.h | 82 ++++++++++++++++++++++ .../volk/volk_32fc_x2_multiply_conjugate_32fc_u.h | 81 +++++++++++++++++++++ 3 files changed, 165 insertions(+) create mode 100644 volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h create mode 100644 volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 20864efbe..d071f18f2 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -59,6 +59,8 @@ volkinclude_HEADERS = \ volk_32fc_32f_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_u.h \ + volk_32fc_s32fc_multiply_conjugate_32fc_a.h \ + volk_32fc_s32fc_multiply_conjugate_32fc_u.h \ volk_32fc_s32f_power_32fc_a.h \ volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \ volk_32fc_s32f_atan2_32f_a.h \ diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h new file mode 100644 index 000000000..70476a8c7 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h @@ -0,0 +1,82 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H +#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m128 conjugator = _mm_setr_ps(1, -1, 1, -1); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + // FIXME: replace with xor for a faster implementation + y = _mm_mul_ps(y, conjugator); // conjugate y + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h new file mode 100644 index 000000000..fbaa29c17 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H +#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m128 conjugator = _mm_set_ps(0, 0x80000000, 0, 0x80000000); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + y = _mm_xor_ps(y, conjugator); // conjugate y + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */ -- cgit From cdb328758dca9fa494956c0e62f5e78adf613982 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Tue, 7 Feb 2012 16:59:50 -0500 Subject: volk: fixed complex multiply and conjugate kernel to use xor for conjugation. --- volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h | 5 ++--- volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h index 70476a8c7..2a1bcbce0 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h @@ -24,15 +24,14 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVecto const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; - __m128 conjugator = _mm_setr_ps(1, -1, 1, -1); + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); for(;number < halfPoints; number++){ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - // FIXME: replace with xor for a faster implementation - y = _mm_mul_ps(y, conjugator); // conjugate y + y = _mm_xor_ps(y, conjugator); // conjugate y yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h index fbaa29c17..92f6a051e 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h @@ -24,7 +24,7 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVecto const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; - __m128 conjugator = _mm_set_ps(0, 0x80000000, 0, 0x80000000); + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); for(;number < halfPoints; number++){ -- cgit From 3080cd75a6a10aab757e1e02fb99e81e2f3724d5 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Tue, 7 Feb 2012 17:23:44 -0500 Subject: volk: adding complex conjugate kernel. --- volk/include/volk/Makefile.am | 5 +- volk/include/volk/volk_32fc_conjugate_32fc_a.h | 64 ++++++++++++++++++++++++++ volk/include/volk/volk_32fc_conjugate_32fc_u.h | 64 ++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 volk/include/volk/volk_32fc_conjugate_32fc_a.h create mode 100644 volk/include/volk/volk_32fc_conjugate_32fc_u.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index d071f18f2..f6b5835b1 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -134,4 +134,7 @@ volkinclude_HEADERS = \ volk_8i_convert_16i_a.h \ volk_8i_convert_16i_u.h \ volk_8i_s32f_convert_32f_a.h \ - volk_8i_s32f_convert_32f_u.h + volk_8i_s32f_convert_32f_u.h \ + volk_32fc_conjugate_32fc_a.h \ + volk_32fc_conjugate_32fc_u.h + diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_conjugate_32fc_a.h new file mode 100644 index 000000000..1518af9be --- /dev/null +++ b/volk/include/volk/volk_32fc_conjugate_32fc_a.h @@ -0,0 +1,64 @@ +#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H +#define INCLUDED_volk_32fc_conjugate_32fc_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm_xor_ps(x, conjugator); // conjugate register + + _mm_store_ps((float*)c,x); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = lv_conj(*a); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_conj(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h new file mode 100644 index 000000000..b26fe0789 --- /dev/null +++ b/volk/include/volk/volk_32fc_conjugate_32fc_u.h @@ -0,0 +1,64 @@ +#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H +#define INCLUDED_volk_32fc_conjugate_32fc_u_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm_xor_ps(x, conjugator); // conjugate register + + _mm_storeu_ps((float*)c,x); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = lv_conj(*a); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_conj(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */ -- cgit From 2eaa0a6e1e57cfc374c258c317ecb469fc49bf53 Mon Sep 17 00:00:00 2001 From: Johnathan Corgan Date: Tue, 14 Feb 2012 15:37:57 -0800 Subject: build: fix autotools for gnuradio-core volkification --- volk/include/volk/Makefile.am | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index f6b5835b1..a01ddf193 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -59,8 +59,8 @@ volkinclude_HEADERS = \ volk_32fc_32f_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_u.h \ - volk_32fc_s32fc_multiply_conjugate_32fc_a.h \ - volk_32fc_s32fc_multiply_conjugate_32fc_u.h \ + volk_32fc_x2_multiply_conjugate_32fc_a.h \ + volk_32fc_x2_multiply_conjugate_32fc_u.h \ volk_32fc_s32f_power_32fc_a.h \ volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \ volk_32fc_s32f_atan2_32f_a.h \ -- cgit From fa8ab7cb146287a9f0d8db67e3126ab4a867a201 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 21 Feb 2012 15:41:27 -0800 Subject: Volk: add scalar const support to the profiler/QA code. Disabled volk_32fc_s32fc_multiply_32fc_a's Orc impl due to it not working. --- volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h index b27a7259f..75cf8c8b2 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h @@ -34,9 +34,9 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, c \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ extern void volk_32fc_s32fc_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points); -static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points); -} +//static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +// volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points); +//} #endif /* LV_HAVE_ORC */ -- cgit From 330cddf31208b843c0997c6fb05cb3facf31f536 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 22 Feb 2012 08:46:55 -0800 Subject: Remove ORC invocation since // doesn't dissuade the generator. --- volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h index 75cf8c8b2..665fad47a 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h @@ -25,20 +25,6 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, c } #endif /* LV_HAVE_GENERIC */ -#ifdef LV_HAVE_ORC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_32fc_s32fc_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points); -//static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ -// volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points); -//} -#endif /* LV_HAVE_ORC */ - -- cgit From e8d644872837f4cbfc05851710531b2ac5259806 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 23 Feb 2012 14:28:21 -0500 Subject: volk: float to short conversion is consistent between archs and tail cases. Rounds to nearest number. --- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 15 ++++++++------- volk/include/volk/volk_32f_s32f_convert_16i_u.h | 15 ++++++++------- 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index 10c921b08..a24959678 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -4,6 +4,7 @@ #include #include #include +#include #ifdef LV_HAVE_SSE2 #include @@ -57,7 +58,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const r = max_val; else if(r < min_val) r = min_val; - outputVector[number] = (int16_t)(r); + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE2 */ @@ -98,10 +99,10 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); } number = quarterPoints * 4; @@ -111,7 +112,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const r = max_val; else if(r < min_val) r = min_val; - outputVector[number] = (int16_t)(r); + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE */ @@ -138,7 +139,7 @@ static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, co r = min_val; else if(r > max_val) r = max_val; - *outputVectorPtr++ = (int16_t)(r); + *outputVectorPtr++ = (int16_t)rintf(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h index f339a7d10..f58158041 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -3,6 +3,7 @@ #include #include +#include #ifdef LV_HAVE_SSE2 #include @@ -57,7 +58,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const r = max_val; else if(r < min_val) r = min_val; - outputVector[number] = (int16_t)(r); + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE2 */ @@ -99,10 +100,10 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); } number = quarterPoints * 4; @@ -112,7 +113,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const r = max_val; else if(r < min_val) r = min_val; - outputVector[number] = (int16_t)(r); + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE */ @@ -140,7 +141,7 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co r = max_val; else if(r < min_val) r = min_val; - *outputVectorPtr++ = (int16_t)(r); + *outputVectorPtr++ = (int16_t)rintf(r); } } #endif /* LV_HAVE_GENERIC */ -- cgit From c53529ff00012646f4aa47e820bddd48995ab094 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 3 Mar 2012 10:11:28 -0500 Subject: volk: include config.h to have rintf in windows/msvc. --- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index a24959678..c2a07398f 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -1,6 +1,10 @@ #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H #define INCLUDED_volk_32f_s32f_convert_16i_a_H +#ifdef HAVE_CONFIG_H +#include +#endif + #include #include #include -- cgit From 4f0add175f8e44922c15ed82d644597eceb65fb6 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 3 Mar 2012 10:11:28 -0500 Subject: volk: include config.h to have rintf in windows/msvc. --- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index a24959678..c2a07398f 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -1,6 +1,10 @@ #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H #define INCLUDED_volk_32f_s32f_convert_16i_a_H +#ifdef HAVE_CONFIG_H +#include +#endif + #include #include #include -- cgit From 313573dbf02749bc48fd48072f04df08374f2a54 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 9 Mar 2012 10:52:06 -0500 Subject: volk: makes the float-to-int conversion consistent and fixes an overflow bug on 32-bit machines. Not currently used in GNU Radio, so no change in behavior there. --- volk/include/volk/volk_32f_s32f_convert_32i_a.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_32i_u.h | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h index 8f2fc791e..15fa282fb 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -22,7 +22,7 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; - float min_val = -2147483648; + float min_val = -2147483647; float max_val = 2147483647; float r; @@ -71,7 +71,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; - float min_val = -2147483648; + float min_val = -2147483647; float max_val = 2147483647; float r; diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h index d8493454b..d203546c6 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -22,10 +22,8 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; - //float min_val = -2147483647; - //float max_val = 2147483647; - float min_val = -2146400000; - float max_val = 2146400000; + float min_val = -2147483647; + float max_val = 2147483647; float r; __m128 vScalar = _mm_set_ps1(scalar); -- cgit From 43224ac7be478cd93f46720377d95f1e53aef87e Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 15 Mar 2012 17:16:18 -0400 Subject: volk: turning off sse implementation of complex dot product for 32-bit machines until it's fixed. --- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h index a865e0737..cde9240cc 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h @@ -196,7 +196,10 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const #if LV_HAVE_SSE && LV_HAVE_32 static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + + volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_bytes); + +#if 0 asm volatile ( " #pushl %%ebp\n\t" @@ -307,12 +310,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const } return; - - - - - - +#endif } #endif /*LV_HAVE_SSE*/ -- cgit From d88328072d5ecd0c3c5aef05b705e2b4a1425d9f Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Wed, 21 Mar 2012 16:59:12 -0700 Subject: volk: add include for config.h to cc file volk_32f_s32f_convert_16i_a was also a problem, previously volk_32f_s32f_convert_16i_u was fixed to have config.h. I heard putting config h stuff into public headers was bad practice. Including config.h into the generated cc file should fix this issue from now on. --- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index c2a07398f..a24959678 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -1,10 +1,6 @@ #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H #define INCLUDED_volk_32f_s32f_convert_16i_a_H -#ifdef HAVE_CONFIG_H -#include -#endif - #include #include #include -- cgit From f919f9dcbb54a08e6e26d6c229ce92fb784fa1b2 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 13 Apr 2012 18:36:53 -0400 Subject: Removed whitespace and added dtools/bin/remove-whitespace as a tool to do this in the future. The sed script was provided by Moritz Fischer. --- volk/include/volk/volk_16i_branch_4_state_8_a.h | 118 +++++++++---------- volk/include/volk/volk_16i_convert_8i_a.h | 4 +- volk/include/volk/volk_16i_convert_8i_u.h | 4 +- volk/include/volk/volk_16i_max_star_16i_a.h | 68 +++++------ .../volk/volk_16i_max_star_horizontal_16i_a.h | 76 ++++++------- .../volk/volk_16i_permute_and_scalar_add_a.h | 62 +++++----- volk/include/volk/volk_16i_s32f_convert_32f_a.h | 8 +- volk/include/volk/volk_16i_s32f_convert_32f_u.h | 8 +- .../include/volk/volk_16i_x4_quad_max_star_16i_a.h | 56 ++++----- volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h | 22 ++-- .../include/volk/volk_16ic_deinterleave_16i_x2_a.h | 2 +- .../volk/volk_16ic_deinterleave_real_16i_a.h | 2 +- volk/include/volk/volk_16ic_magnitude_16i_a.h | 6 +- .../volk/volk_16ic_s32f_deinterleave_32f_x2_a.h | 6 +- .../volk/volk_16ic_s32f_deinterleave_real_32f_a.h | 10 +- volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h | 14 +-- volk/include/volk/volk_16u_byteswap_a.h | 4 +- volk/include/volk/volk_32f_accumulator_s32f_a.h | 6 +- volk/include/volk/volk_32f_convert_64f_a.h | 6 +- volk/include/volk/volk_32f_convert_64f_u.h | 6 +- volk/include/volk/volk_32f_index_max_16u_a.h | 12 +- .../volk/volk_32f_s32f_32f_fm_detect_32f_a.h | 12 +- ...volk_32f_s32f_calc_spectral_noise_floor_32f_a.h | 8 +- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 8 +- volk/include/volk/volk_32f_s32f_convert_16i_u.h | 8 +- volk/include/volk/volk_32f_s32f_convert_32i_a.h | 12 +- volk/include/volk/volk_32f_s32f_convert_32i_u.h | 8 +- volk/include/volk/volk_32f_s32f_convert_8i_a.h | 10 +- volk/include/volk/volk_32f_s32f_convert_8i_u.h | 10 +- volk/include/volk/volk_32f_s32f_multiply_32f_a.h | 20 ++-- volk/include/volk/volk_32f_s32f_multiply_32f_u.h | 20 ++-- volk/include/volk/volk_32f_s32f_power_32f_a.h | 20 ++-- volk/include/volk/volk_32f_s32f_stddev_32f_a.h | 12 +- volk/include/volk/volk_32f_sqrt_32f_a.h | 10 +- .../volk/volk_32f_stddev_and_mean_32f_x2_a.h | 12 +- volk/include/volk/volk_32f_x2_add_32f_a.h | 10 +- volk/include/volk/volk_32f_x2_add_32f_u.h | 10 +- volk/include/volk/volk_32f_x2_divide_32f_a.h | 10 +- volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 32 +++--- volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 32 +++--- volk/include/volk/volk_32f_x2_interleave_32fc_a.h | 2 +- volk/include/volk/volk_32f_x2_max_32f_a.h | 10 +- volk/include/volk/volk_32f_x2_min_32f_a.h | 10 +- volk/include/volk/volk_32f_x2_multiply_32f_a.h | 20 ++-- volk/include/volk/volk_32f_x2_multiply_32f_u.h | 20 ++-- .../volk/volk_32f_x2_s32f_interleave_16ic_a.h | 12 +- volk/include/volk/volk_32f_x2_subtract_32f_a.h | 10 +- volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h | 70 ++++++------ volk/include/volk/volk_32fc_32f_multiply_32fc_a.h | 16 +-- volk/include/volk/volk_32fc_conjugate_32fc_a.h | 4 +- volk/include/volk/volk_32fc_conjugate_32fc_u.h | 6 +- .../include/volk/volk_32fc_deinterleave_32f_x2_a.h | 4 +- .../include/volk/volk_32fc_deinterleave_64f_x2_a.h | 8 +- .../volk/volk_32fc_deinterleave_imag_32f_a.h | 2 +- .../volk/volk_32fc_deinterleave_real_32f_a.h | 2 +- .../volk/volk_32fc_deinterleave_real_64f_a.h | 6 +- volk/include/volk/volk_32fc_index_max_16u_a.h | 126 ++++++++++----------- volk/include/volk/volk_32fc_magnitude_32f_a.h | 2 +- volk/include/volk/volk_32fc_magnitude_32f_u.h | 2 +- .../volk/volk_32fc_magnitude_squared_32f_a.h | 2 +- .../volk/volk_32fc_magnitude_squared_32f_u.h | 2 +- volk/include/volk/volk_32fc_s32f_atan2_32f_a.h | 20 ++-- volk/include/volk/volk_32fc_s32f_power_32fc_a.h | 38 +++---- .../volk/volk_32fc_s32f_power_spectrum_32f_a.h | 24 ++-- ...olk_32fc_s32f_x2_power_spectral_density_32f_a.h | 24 ++-- .../include/volk/volk_32fc_s32fc_multiply_32fc_a.h | 12 +- .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 12 +- .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h | 84 +++++++------- .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 32 +++--- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h | 126 ++++++++++----------- volk/include/volk/volk_32fc_x2_multiply_32fc_a.h | 14 +-- volk/include/volk/volk_32fc_x2_multiply_32fc_u.h | 14 +-- .../volk/volk_32fc_x2_multiply_conjugate_32fc_a.h | 14 +-- .../volk/volk_32fc_x2_multiply_conjugate_32fc_u.h | 14 +-- ...lk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h | 56 ++++----- volk/include/volk/volk_32fc_x2_square_dist_32f_a.h | 40 +++---- volk/include/volk/volk_32i_s32f_convert_32f_a.h | 2 +- volk/include/volk/volk_32i_s32f_convert_32f_u.h | 2 +- volk/include/volk/volk_32i_x2_and_32i_a.h | 10 +- volk/include/volk/volk_32i_x2_or_32i_a.h | 10 +- volk/include/volk/volk_32u_byteswap_a.h | 6 +- volk/include/volk/volk_64f_convert_32f_a.h | 6 +- volk/include/volk/volk_64f_convert_32f_u.h | 6 +- volk/include/volk/volk_64f_x2_max_64f_a.h | 10 +- volk/include/volk/volk_64f_x2_min_64f_a.h | 10 +- volk/include/volk/volk_64u_byteswap_a.h | 18 +-- volk/include/volk/volk_64u_popcnt_a.h | 2 +- volk/include/volk/volk_8i_s32f_convert_32f_a.h | 2 +- volk/include/volk/volk_8i_s32f_convert_32f_u.h | 2 +- .../volk/volk_8ic_s32f_deinterleave_32f_x2_a.h | 8 +- .../volk/volk_8ic_s32f_deinterleave_real_32f_a.h | 10 +- .../volk/volk_8ic_x2_multiply_conjugate_16ic_a.h | 8 +- .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h | 4 +- 93 files changed, 870 insertions(+), 870 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a.h b/volk/include/volk/volk_16i_branch_4_state_8_a.h index 0424e66e9..6338fbdd1 100644 --- a/volk/include/volk/volk_16i_branch_4_state_8_a.h +++ b/volk/include/volk/volk_16i_branch_4_state_8_a.h @@ -3,7 +3,7 @@ #include -#include +#include @@ -15,32 +15,32 @@ #include static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - - + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; - - + + p_target = (__m128i*)target; p_src0 = (__m128i*)src0; p_cntl2 = (__m128i*)cntl2; p_cntl3 = (__m128i*)cntl3; p_scalars = (__m128i*)scalars; - + int i = 0; - + int bound = 1; - - + + xmm0 = _mm_load_si128(p_scalars); - + xmm1 = _mm_shufflelo_epi16(xmm0, 0); xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); xmm2 = _mm_shuffle_epi32(xmm2, 0x00); xmm3 = _mm_shuffle_epi32(xmm3, 0x00); @@ -52,40 +52,40 @@ static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src xmm10 = _mm_load_si128((__m128i*)permuters[3]); for(; i < bound; ++i) { - + xmm5 = _mm_load_si128(p_src0); - - - - - - - + + + + + + + xmm0 = _mm_shuffle_epi8(xmm5, xmm0); xmm6 = _mm_shuffle_epi8(xmm5, xmm6); xmm8 = _mm_shuffle_epi8(xmm5, xmm8); xmm10 = _mm_shuffle_epi8(xmm5, xmm10); - + p_src0 += 4; - - + + xmm5 = _mm_add_epi16(xmm1, xmm2); - + xmm6 = _mm_add_epi16(xmm2, xmm6); xmm8 = _mm_add_epi16(xmm1, xmm8); - - + + xmm7 = _mm_load_si128(p_cntl2); xmm9 = _mm_load_si128(p_cntl3); - + xmm0 = _mm_add_epi16(xmm5, xmm0); - - + + xmm7 = _mm_and_si128(xmm7, xmm3); xmm9 = _mm_and_si128(xmm9, xmm4); - + xmm5 = _mm_load_si128(&p_cntl2[1]); xmm11 = _mm_load_si128(&p_cntl3[1]); @@ -95,96 +95,96 @@ static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src xmm11 = _mm_and_si128(xmm11, xmm4); xmm0 = _mm_add_epi16(xmm0, xmm7); - - - + + + xmm7 = _mm_load_si128(&p_cntl2[2]); xmm9 = _mm_load_si128(&p_cntl3[2]); - + xmm5 = _mm_add_epi16(xmm5, xmm11); - + xmm7 = _mm_and_si128(xmm7, xmm3); xmm9 = _mm_and_si128(xmm9, xmm4); - + xmm6 = _mm_add_epi16(xmm6, xmm5); - - + + xmm5 = _mm_load_si128(&p_cntl2[3]); xmm11 = _mm_load_si128(&p_cntl3[3]); - + xmm7 = _mm_add_epi16(xmm7, xmm9); - + xmm5 = _mm_and_si128(xmm5, xmm3); xmm11 = _mm_and_si128(xmm11, xmm4); - + xmm8 = _mm_add_epi16(xmm8, xmm7); - + xmm5 = _mm_add_epi16(xmm5, xmm11); - + _mm_store_si128(p_target, xmm0); _mm_store_si128(&p_target[1], xmm6); xmm10 = _mm_add_epi16(xmm5, xmm10); - + _mm_store_si128(&p_target[2], xmm8); - + _mm_store_si128(&p_target[3], xmm10); - - p_target += 3; + + p_target += 3; } } - - + + #endif /*LV_HAVE_SSEs*/ #ifdef LV_HAVE_GENERIC static inline void volk_16i_branch_4_state_8_a_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { int i = 0; - + int bound = 4; - + for(; i < bound; ++i) { - target[i* 8] = src0[((char)permuters[i][0])/2] + target[i* 8] = src0[((char)permuters[i][0])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8] & scalars[2]) + (cntl3[i * 8] & scalars[3]); - target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] + target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 1] & scalars[2]) + (cntl3[i * 8 + 1] & scalars[3]); - target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] + target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 2] & scalars[2]) + (cntl3[i * 8 + 2] & scalars[3]); - target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] + target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 3] & scalars[2]) + (cntl3[i * 8 + 3] & scalars[3]); - target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] + target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 4] & scalars[2]) + (cntl3[i * 8 + 4] & scalars[3]); - target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] + target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 5] & scalars[2]) + (cntl3[i * 8 + 5] & scalars[3]); - target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] + target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 6] & scalars[2]) + (cntl3[i * 8 + 6] & scalars[3]); - target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] + target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 7] & scalars[2]) + (cntl3[i * 8 + 7] & scalars[3]); - + } } diff --git a/volk/include/volk/volk_16i_convert_8i_a.h b/volk/include/volk/volk_16i_convert_8i_a.h index 8046035c7..84548c8c5 100644 --- a/volk/include/volk/volk_16i_convert_8i_a.h +++ b/volk/include/volk/volk_16i_convert_8i_a.h @@ -15,7 +15,7 @@ static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; - + int8_t* outputVectorPtr = outputVector; int16_t* inputPtr = (int16_t*)inputVector; __m128i inputVal1; @@ -30,7 +30,7 @@ static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_ inputVal1 = _mm_srai_epi16(inputVal1, 8); inputVal2 = _mm_srai_epi16(inputVal2, 8); - + ret = _mm_packs_epi16(inputVal1, inputVal2); _mm_store_si128((__m128i*)outputVectorPtr, ret); diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/include/volk/volk_16i_convert_8i_u.h index df1084fe0..80608a141 100644 --- a/volk/include/volk/volk_16i_convert_8i_u.h +++ b/volk/include/volk/volk_16i_convert_8i_u.h @@ -16,7 +16,7 @@ static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; - + int8_t* outputVectorPtr = outputVector; int16_t* inputPtr = (int16_t*)inputVector; __m128i inputVal1; @@ -31,7 +31,7 @@ static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_ inputVal1 = _mm_srai_epi16(inputVal1, 8); inputVal2 = _mm_srai_epi16(inputVal2, 8); - + ret = _mm_packs_epi16(inputVal1, inputVal2); _mm_storeu_si128((__m128i*)outputVectorPtr, ret); diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/include/volk/volk_16i_max_star_16i_a.h index 28197ddef..edfff8a82 100644 --- a/volk/include/volk/volk_16i_max_star_16i_a.h +++ b/volk/include/volk/volk_16i_max_star_16i_a.h @@ -3,7 +3,7 @@ #include -#include +#include #ifdef LV_HAVE_SSSE3 @@ -15,82 +15,82 @@ static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_bytes) { - + short candidate = src0[0]; short cands[8]; __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6; - + __m128i *p_src0; - + p_src0 = (__m128i*)src0; int bound = num_bytes >> 4; int leftovers = (num_bytes >> 1) & 7; - + int i = 0; - - + + xmm1 = _mm_setzero_si128(); xmm0 = _mm_setzero_si128(); //_mm_insert_epi16(xmm0, candidate, 0); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm1); - + xmm0 = _mm_shuffle_epi8(xmm0, xmm1); + + for(i = 0; i < bound; ++i) { xmm1 = _mm_load_si128(p_src0); p_src0 += 1; //xmm2 = _mm_sub_epi16(xmm1, xmm0); - - - - - + + + + + xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); xmm6 = _mm_xor_si128(xmm4, xmm5); - + xmm3 = _mm_and_si128(xmm3, xmm0); xmm4 = _mm_and_si128(xmm6, xmm1); - + xmm0 = _mm_add_epi16(xmm3, xmm4); - - + + } - + _mm_store_si128((__m128i*)cands, xmm0); - + for(i = 0; i < 8; ++i) { candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; } - - - + + + for(i = 0; i < leftovers; ++i) { - + candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; } target[0] = candidate; - - - - -} - + + + + +} + #endif /*LV_HAVE_SSSE3*/ #ifdef LV_HAVE_GENERIC static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, unsigned int num_bytes) { - + int i = 0; - + int bound = num_bytes >> 1; short candidate = src0[0]; @@ -98,7 +98,7 @@ static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, u candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; } target[0] = candidate; - + } diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h index a10a62350..c1c908425 100644 --- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h +++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h @@ -4,7 +4,7 @@ #include #include -#include +#include #ifdef LV_HAVE_SSSE3 @@ -20,107 +20,107 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; - - + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; __m128i xmm5, xmm6, xmm7, xmm8; - + xmm4 = _mm_load_si128((__m128i*)shufmask0); xmm5 = _mm_load_si128((__m128i*)shufmask1); xmm6 = _mm_load_si128((__m128i*)andmask0); xmm7 = _mm_load_si128((__m128i*)andmask1); - + __m128i *p_target, *p_src0; - + p_target = (__m128i*)target; p_src0 = (__m128i*)src0; int bound = num_bytes >> 5; int intermediate = (num_bytes >> 4) & 1; int leftovers = (num_bytes >> 1) & 7; - + int i = 0; - - + + for(i = 0; i < bound; ++i) { - + xmm0 = _mm_load_si128(p_src0); xmm1 = _mm_load_si128(&p_src0[1]); - - + + xmm2 = _mm_xor_si128(xmm2, xmm2); p_src0 += 2; - + xmm3 = _mm_hsub_epi16(xmm0, xmm1); - - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); xmm8 = _mm_and_si128(xmm2, xmm6); xmm3 = _mm_and_si128(xmm2, xmm7); - + xmm8 = _mm_add_epi8(xmm8, xmm4); xmm3 = _mm_add_epi8(xmm3, xmm5); xmm0 = _mm_shuffle_epi8(xmm0, xmm8); xmm1 = _mm_shuffle_epi8(xmm1, xmm3); - - + + xmm3 = _mm_add_epi16(xmm0, xmm1); - + _mm_store_si128(p_target, xmm3); - + p_target += 1; - + } for(i = 0; i < intermediate; ++i) { - + xmm0 = _mm_load_si128(p_src0); - - + + xmm2 = _mm_xor_si128(xmm2, xmm2); p_src0 += 1; - + xmm3 = _mm_hsub_epi16(xmm0, xmm1); xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); xmm8 = _mm_and_si128(xmm2, xmm6); - + xmm3 = _mm_add_epi8(xmm8, xmm4); - + xmm0 = _mm_shuffle_epi8(xmm0, xmm3); - + _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec); - + p_target = (__m128i*)((int8_t*)p_target + 8); } - - for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { + + for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; } - -} - + +} + #endif /*LV_HAVE_SSSE3*/ #ifdef LV_HAVE_GENERIC static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { - + int i = 0; - + int bound = num_bytes >> 1; - + for(i = 0; i < bound; i += 2) { target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; } - + } diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h index de36cee80..47e3cbf9c 100644 --- a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h +++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h @@ -3,7 +3,7 @@ #include -#include +#include @@ -14,33 +14,33 @@ #include static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; short* p_permute_indexes = permute_indexes; - + p_target = (__m128i*)target; p_cntl0 = (__m128i*)cntl0; p_cntl1 = (__m128i*)cntl1; p_cntl2 = (__m128i*)cntl2; p_cntl3 = (__m128i*)cntl3; p_scalars = (__m128i*)scalars; - + int i = 0; - + int bound = (num_bytes >> 4); int leftovers = (num_bytes >> 1) & 7; - + xmm0 = _mm_load_si128(p_scalars); - + xmm1 = _mm_shufflelo_epi16(xmm0, 0); xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); xmm2 = _mm_shuffle_epi32(xmm2, 0x00); xmm3 = _mm_shuffle_epi32(xmm3, 0x00); @@ -64,49 +64,49 @@ static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short xmm0 = _mm_add_epi16(xmm0, xmm5); xmm6 = _mm_add_epi16(xmm6, xmm7); - + p_permute_indexes += 8; - + xmm0 = _mm_add_epi16(xmm0, xmm6); - + xmm5 = _mm_load_si128(p_cntl0); xmm6 = _mm_load_si128(p_cntl1); xmm7 = _mm_load_si128(p_cntl2); - + xmm5 = _mm_and_si128(xmm5, xmm1); xmm6 = _mm_and_si128(xmm6, xmm2); xmm7 = _mm_and_si128(xmm7, xmm3); - + xmm0 = _mm_add_epi16(xmm0, xmm5); - + xmm5 = _mm_load_si128(p_cntl3); - + xmm6 = _mm_add_epi16(xmm6, xmm7); p_cntl0 += 1; - + xmm5 = _mm_and_si128(xmm5, xmm4); - + xmm0 = _mm_add_epi16(xmm0, xmm6); - + p_cntl1 += 1; p_cntl2 += 1; - - xmm0 = _mm_add_epi16(xmm0, xmm5); - + + xmm0 = _mm_add_epi16(xmm0, xmm5); + p_cntl3 += 1; _mm_store_si128(p_target, xmm0); - + p_target += 1; } - - - - + + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target[i] = src0[permute_indexes[i]] + target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) @@ -118,18 +118,18 @@ static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short #ifdef LV_HAVE_GENERIC static inline void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - + int i = 0; - + int bound = num_bytes >> 1; for(i = 0; i < bound; ++i) { - target[i] = src0[permute_indexes[i]] + target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) + (cntl3[i] & scalars[3]); - + } } diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a.h b/volk/include/volk/volk_16i_s32f_convert_32f_a.h index 0555fdf00..7108ff659 100644 --- a/volk/include/volk/volk_16i_s32f_convert_32f_a.h +++ b/volk/include/volk/volk_16i_s32f_convert_32f_a.h @@ -17,7 +17,7 @@ static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; - + float* outputVectorPtr = outputVector; __m128 invScalar = _mm_set_ps1(1.0/scalar); int16_t* inputPtr = (int16_t*)inputVector; @@ -36,7 +36,7 @@ static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const // Convert the lower 4 values into 32 bit words inputVal = _mm_cvtepi16_epi32(inputVal); inputVal2 = _mm_cvtepi16_epi32(inputVal2); - + ret = _mm_cvtepi32_ps(inputVal); ret = _mm_mul_ps(ret, invScalar); _mm_storeu_ps(outputVectorPtr, ret); @@ -71,7 +71,7 @@ static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + float* outputVectorPtr = outputVector; __m128 invScalar = _mm_set_ps1(1.0/scalar); int16_t* inputPtr = (int16_t*)inputVector; @@ -79,7 +79,7 @@ static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const in for(;number < quarterPoints; number++){ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - + ret = _mm_mul_ps(ret, invScalar); _mm_storeu_ps(outputVectorPtr, ret); diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/include/volk/volk_16i_s32f_convert_32f_u.h index d34acc091..4ce8e8f35 100644 --- a/volk/include/volk/volk_16i_s32f_convert_32f_u.h +++ b/volk/include/volk/volk_16i_s32f_convert_32f_u.h @@ -18,7 +18,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; - + float* outputVectorPtr = outputVector; __m128 invScalar = _mm_set_ps1(1.0/scalar); int16_t* inputPtr = (int16_t*)inputVector; @@ -37,7 +37,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const // Convert the lower 4 values into 32 bit words inputVal = _mm_cvtepi16_epi32(inputVal); inputVal2 = _mm_cvtepi16_epi32(inputVal2); - + ret = _mm_cvtepi32_ps(inputVal); ret = _mm_mul_ps(ret, invScalar); _mm_storeu_ps(outputVectorPtr, ret); @@ -73,7 +73,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + float* outputVectorPtr = outputVector; __m128 invScalar = _mm_set_ps1(1.0/scalar); int16_t* inputPtr = (int16_t*)inputVector; @@ -81,7 +81,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const in for(;number < quarterPoints; number++){ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - + ret = _mm_mul_ps(ret, invScalar); _mm_storeu_ps(outputVectorPtr, ret); diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h index 2688aff04..0d8498553 100644 --- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h +++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h @@ -3,7 +3,7 @@ #include -#include +#include @@ -14,7 +14,7 @@ #include static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - + @@ -23,41 +23,41 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s int bound = (num_bytes >> 4); int bound_copy = bound; int leftovers = (num_bytes >> 1) & 7; - + __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; p_target = (__m128i*) target; p_src0 = (__m128i*)src0; p_src1 = (__m128i*)src1; p_src2 = (__m128i*)src2; p_src3 = (__m128i*)src3; - - + + __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; while(bound_copy > 0) { - + xmm1 = _mm_load_si128(p_src0); xmm2 = _mm_load_si128(p_src1); xmm3 = _mm_load_si128(p_src2); xmm4 = _mm_load_si128(p_src3); - + xmm5 = _mm_setzero_si128(); xmm6 = _mm_setzero_si128(); xmm7 = xmm1; xmm8 = xmm3; - - + + xmm1 = _mm_sub_epi16(xmm2, xmm1); - + xmm3 = _mm_sub_epi16(xmm4, xmm3); xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); - + xmm2 = _mm_and_si128(xmm5, xmm2); xmm4 = _mm_and_si128(xmm6, xmm4); @@ -67,7 +67,7 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s xmm5 = _mm_add_epi16(xmm2, xmm5); xmm6 = _mm_add_epi16(xmm4, xmm6); - + xmm1 = _mm_xor_si128(xmm1, xmm1); xmm2 = xmm5; xmm5 = _mm_sub_epi16(xmm6, xmm5); @@ -76,23 +76,23 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); p_src1 += 1; - + xmm6 = _mm_and_si128(xmm1, xmm6); - + xmm1 = _mm_andnot_si128(xmm1, xmm2); p_src2 += 1; - + xmm1 = _mm_add_epi16(xmm6, xmm1); p_src3 += 1; - + _mm_store_si128(p_target, xmm1); p_target += 1; - + } - + /*asm volatile ( @@ -111,25 +111,25 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s "movaps %%xmm3, %%xmm8\n\t" "psubw %%xmm2, %%xmm1\n\t" "psubw %%xmm4, %%xmm3\n\t" - + "pcmpgtw %%xmm1, %%xmm5\n\t" "pcmpgtw %%xmm3, %%xmm6\n\t" - + "pand %%xmm5, %%xmm2\n\t" "pand %%xmm6, %%xmm4\n\t" "pandn %%xmm7, %%xmm5\n\t" "pandn %%xmm8, %%xmm6\n\t" - + "paddw %%xmm2, %%xmm5\n\t" "paddw %%xmm4, %%xmm6\n\t" "pxor %%xmm1, %%xmm1\n\t" "movaps %%xmm5, %%xmm2\n\t" - + "psubw %%xmm6, %%xmm5\n\t" "add $16, %[src0]\n\t" "add $-1, %[bound]\n\t" - + "pcmpgtw %%xmm5, %%xmm1\n\t" "add $16, %[src1]\n\t" @@ -144,13 +144,13 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s "movaps %%xmm1, (%[target])\n\t" "addw $16, %[target]\n\t" "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t" - + "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t" : :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) : ); - */ + */ short temp0 = 0; short temp1 = 0; @@ -169,11 +169,11 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s #ifdef LV_HAVE_GENERIC static inline void volk_16i_x4_quad_max_star_16i_a_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - + int i = 0; - + int bound = num_bytes >> 1; - + short temp0 = 0; short temp1 = 0; for(i = 0; i < bound; ++i) { diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h index e4c9f17ed..5560b92d9 100644 --- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h +++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h @@ -3,7 +3,7 @@ #include -#include +#include @@ -14,7 +14,7 @@ #include static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; p_target0 = (__m128i*)target0; @@ -39,16 +39,16 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta xmm2 = _mm_load_si128(p_src2); xmm3 = _mm_load_si128(p_src3); xmm4 = _mm_load_si128(p_src4); - + p_src0 += 1; p_src1 += 1; - + xmm1 = _mm_add_epi16(xmm0, xmm1); xmm2 = _mm_add_epi16(xmm0, xmm2); xmm3 = _mm_add_epi16(xmm0, xmm3); xmm4 = _mm_add_epi16(xmm0, xmm4); - - + + p_src2 += 1; p_src3 += 1; p_src4 += 1; @@ -57,7 +57,7 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta _mm_store_si128(p_target1, xmm2); _mm_store_si128(p_target2, xmm3); _mm_store_si128(p_target3, xmm4); - + p_target0 += 1; p_target1 += 1; p_target2 += 1; @@ -97,9 +97,9 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); - + */ - + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { target0[i] = src0[i] + src1[i]; @@ -114,9 +114,9 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta #ifdef LV_HAVE_GENERIC static inline void volk_16i_x5_add_quad_16i_x4_a_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - + int i = 0; - + int bound = num_bytes >> 1; for(i = 0; i < bound; ++i) { diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h index cdd60235e..f8aa30874 100644 --- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h +++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h @@ -71,7 +71,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); unsigned int eighthPoints = num_points / 8; - + for(number = 0; number < eighthPoints; number++){ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h index 2b99e068e..bac1f2e4b 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h @@ -64,7 +64,7 @@ static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, cons __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); unsigned int eighthPoints = num_points / 8; - + for(number = 0; number < eighthPoints; number++){ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a.h b/volk/include/volk/volk_16ic_magnitude_16i_a.h index a6951e967..317075e85 100644 --- a/volk/include/volk/volk_16ic_magnitude_16i_a.h +++ b/volk/include/volk/volk_16ic_magnitude_16i_a.h @@ -17,7 +17,7 @@ static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const int16_t* complexVectorPtr = (const int16_t*)complexVector; int16_t* magnitudeVectorPtr = magnitudeVector; @@ -35,7 +35,7 @@ static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, cons inputFloatBuffer[1] = (float)(complexVectorPtr[1]); inputFloatBuffer[2] = (float)(complexVectorPtr[2]); inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); inputFloatBuffer[5] = (float)(complexVectorPtr[5]); inputFloatBuffer[6] = (float)(complexVectorPtr[6]); @@ -106,7 +106,7 @@ static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const inputFloatBuffer[1] = (float)(complexVectorPtr[1]); inputFloatBuffer[2] = (float)(complexVectorPtr[2]); inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - + cplxValue1 = _mm_load_ps(inputFloatBuffer); complexVectorPtr += 4; diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h index e73d405e0..1300395ff 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h @@ -20,7 +20,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, floa float* qBufferPtr = qBuffer; uint64_t number = 0; - const uint64_t quarterPoints = num_points / 4; + const uint64_t quarterPoints = num_points / 4; __m128 cplxValue1, cplxValue2, iValue, qValue; __m128 invScalar = _mm_set_ps1(1.0/scalar); @@ -29,12 +29,12 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, floa __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; for(;number < quarterPoints; number++){ - + floatBuffer[0] = (float)(complexVectorPtr[0]); floatBuffer[1] = (float)(complexVectorPtr[1]); floatBuffer[2] = (float)(complexVectorPtr[2]); floatBuffer[3] = (float)(complexVectorPtr[3]); - + floatBuffer[4] = (float)(complexVectorPtr[4]); floatBuffer[5] = (float)(complexVectorPtr[5]); floatBuffer[6] = (float)(complexVectorPtr[6]); diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h index 1630cb0ed..5e2d82b94 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h @@ -18,7 +18,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, float* iBufferPtr = iBuffer; unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 iFloatValue; @@ -49,7 +49,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; sixteenTComplexVectorPtr++; } - + } #endif /* LV_HAVE_SSE4_1 */ @@ -66,7 +66,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co float* iBufferPtr = iBuffer; unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 iValue; const float iScalar = 1.0/scalar; @@ -77,7 +77,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co for(;number < quarterPoints; number++){ floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; @@ -96,7 +96,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; complexVectorPtr++; } - + } #endif /* LV_HAVE_SSE */ diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h index 35406e2cb..d20eea1a7 100644 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h @@ -18,7 +18,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* magnitudeVectorPtr = magnitudeVector; @@ -34,7 +34,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, c inputFloatBuffer[1] = (float)(complexVectorPtr[1]); inputFloatBuffer[2] = (float)(complexVectorPtr[2]); inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); inputFloatBuffer[5] = (float)(complexVectorPtr[5]); inputFloatBuffer[6] = (float)(complexVectorPtr[6]); @@ -56,7 +56,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, c result = _mm_sqrt_ps(result); // Square root the values _mm_store_ps(magnitudeVectorPtr, result); - + magnitudeVectorPtr += 4; } @@ -99,7 +99,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co inputFloatBuffer[1] = (float)(complexVectorPtr[1]); inputFloatBuffer[2] = (float)(complexVectorPtr[2]); inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); inputFloatBuffer[5] = (float)(complexVectorPtr[5]); inputFloatBuffer[6] = (float)(complexVectorPtr[6]); @@ -107,7 +107,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - + re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); @@ -124,7 +124,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co result = _mm_sqrt_ps(result); // Square root the values _mm_store_ps(magnitudeVectorPtr, result); - + magnitudeVectorPtr += 4; } @@ -138,7 +138,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co } } - + #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC diff --git a/volk/include/volk/volk_16u_byteswap_a.h b/volk/include/volk/volk_16u_byteswap_a.h index 75c7ef0f3..fc3eb5fa7 100644 --- a/volk/include/volk/volk_16u_byteswap_a.h +++ b/volk/include/volk/volk_16u_byteswap_a.h @@ -31,9 +31,9 @@ static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int n inputPtr += 8; } - + // Byteswap any remaining points: - number = eighthPoints*8; + number = eighthPoints*8; for(; number < num_points; number++){ uint16_t outputVal = *inputPtr; outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a.h b/volk/include/volk/volk_32f_accumulator_s32f_a.h index 7ce0d1c80..78364d0a0 100644 --- a/volk/include/volk/volk_32f_accumulator_s32f_a.h +++ b/volk/include/volk/volk_32f_accumulator_s32f_a.h @@ -20,13 +20,13 @@ static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* i const float* aPtr = inputBuffer; __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; - + __m128 accumulator = _mm_setzero_ps(); __m128 aVal = _mm_setzero_ps(); for(;number < quarterPoints; number++){ aVal = _mm_load_ps(aPtr); - accumulator = _mm_add_ps(accumulator, aVal); + accumulator = _mm_add_ps(accumulator, aVal); aPtr += 4; } _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container @@ -34,7 +34,7 @@ static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* i returnValue += tempBuffer[1]; returnValue += tempBuffer[2]; returnValue += tempBuffer[3]; - + number = quarterPoints * 4; for(;number < num_points; number++){ returnValue += (*aPtr++); diff --git a/volk/include/volk/volk_32f_convert_64f_a.h b/volk/include/volk/volk_32f_convert_64f_a.h index dda646409..2c469ac42 100644 --- a/volk/include/volk/volk_32f_convert_64f_a.h +++ b/volk/include/volk/volk_32f_convert_64f_a.h @@ -16,7 +16,7 @@ static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; double* outputVectorPtr = outputVector; __m128d ret; @@ -24,7 +24,7 @@ static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float for(;number < quarterPoints; number++){ inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - + ret = _mm_cvtps_pd(inputVal); _mm_store_pd(outputVectorPtr, ret); @@ -38,7 +38,7 @@ static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float outputVectorPtr += 2; } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ outputVector[number] = (double)(inputVector[number]); } diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h index 387baa3b9..10d8a4f6c 100644 --- a/volk/include/volk/volk_32f_convert_64f_u.h +++ b/volk/include/volk/volk_32f_convert_64f_u.h @@ -16,7 +16,7 @@ static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; double* outputVectorPtr = outputVector; __m128d ret; @@ -24,7 +24,7 @@ static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float for(;number < quarterPoints; number++){ inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - + ret = _mm_cvtps_pd(inputVal); _mm_storeu_pd(outputVectorPtr, ret); @@ -38,7 +38,7 @@ static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float outputVectorPtr += 2; } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ outputVector[number] = (double)(inputVector[number]); } diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/include/volk/volk_32f_index_max_16u_a.h index 0c43a5081..b9ca1dd3e 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a.h +++ b/volk/include/volk/volk_32f_index_max_16u_a.h @@ -52,7 +52,7 @@ static inline void volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const f } number = quarterPoints * 4; - for(;number < num_points; number++){ + for(;number < num_points; number++){ if(src0[number] > max){ index = number; max = src0[number]; @@ -111,7 +111,7 @@ static inline void volk_32f_index_max_16u_a_sse(unsigned int* target, const floa } number = quarterPoints * 4; - for(;number < num_points; number++){ + for(;number < num_points; number++){ if(src0[number] > max){ index = number; max = src0[number]; @@ -128,11 +128,11 @@ static inline void volk_32f_index_max_16u_a_generic(unsigned int* target, const if(num_points > 0){ float max = src0[0]; unsigned int index = 0; - - unsigned int i = 1; - + + unsigned int i = 1; + for(; i < num_points; ++i) { - + if(src0[i] > max){ index = i; max = src0[i]; diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h index b25df75a1..43713f8b5 100644 --- a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h @@ -46,7 +46,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co inPtr++; outPtr++; } - + for (; number < quarterPoints; number++) { // Load data next3old1 = _mm_loadu_ps((float*) (inPtr-1)); @@ -65,7 +65,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co _mm_store_ps(outPtr,next3old1); // Store the results back into the output outPtr += 4; } - + for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) { *outPtr = *(inPtr) - *(inPtr-1); if (*outPtr > bound) *outPtr -= 2*bound; @@ -73,7 +73,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co inPtr++; outPtr++; } - + *saveValue = inputVector[num_points-1]; } #endif /* LV_HAVE_SSE */ @@ -94,14 +94,14 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector unsigned int number = 0; float* outPtr = outputVector; const float* inPtr = inputVector; - + // Do the first 1 by hand since we're going in from the saveValue: *outPtr = *inPtr - *saveValue; if (*outPtr > bound) *outPtr -= 2*bound; if (*outPtr < -bound) *outPtr += 2*bound; inPtr++; outPtr++; - + for (number = 1; number < num_points; number++) { *outPtr = *(inPtr) - *(inPtr-1); if (*outPtr > bound) *outPtr -= 2*bound; @@ -109,7 +109,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector inPtr++; outPtr++; } - + *saveValue = inputVector[num_points-1]; } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h index b1902a8c0..db61e359d 100644 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h @@ -23,7 +23,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois const float* dataPointsPtr = realDataPoints; __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; - + __m128 dataPointsVal; __m128 avgPointsVal = _mm_setzero_ps(); // Calculate the sum (for mean) for all points @@ -73,11 +73,11 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); - + // Count the number of bins which do not exceed the mean amplitude vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); } - + // Calculate the mean from the remaining data points _mm_store_ps(avgPointsVector, avgPointsVal); @@ -104,7 +104,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois validBinCount += 1.0; } } - + float localNoiseFloorAmplitude = 0; if(validBinCount > 0.0){ localNoiseFloorAmplitude = sumMean / validBinCount; diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index a24959678..9df4946f2 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -19,7 +19,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; - + const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; @@ -51,7 +51,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const outputVectorPtr += 8; } - number = eighthPoints * 8; + number = eighthPoints * 8; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -76,7 +76,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; @@ -105,7 +105,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h index f58158041..56e42c9bd 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -19,7 +19,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; - + const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; @@ -51,7 +51,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const outputVectorPtr += 8; } - number = eighthPoints * 8; + number = eighthPoints * 8; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -77,7 +77,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; @@ -106,7 +106,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h index 15fa282fb..38e6b2e74 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -18,7 +18,7 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; - + const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; @@ -42,7 +42,7 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const outputVectorPtr += 8; } - number = eighthPoints * 8; + number = eighthPoints * 8; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -67,7 +67,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; @@ -91,7 +91,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const outputVectorPtr += 4; } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -116,7 +116,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; @@ -144,7 +144,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h index d203546c6..ee15edb46 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -18,7 +18,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; @@ -42,7 +42,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const outputVectorPtr += 4; } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -68,7 +68,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; @@ -96,7 +96,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h index 05172171c..800017d5d 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h @@ -18,7 +18,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; - + const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; @@ -47,7 +47,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f intInputVal2 = _mm_cvtps_epi32(inputVal2); intInputVal3 = _mm_cvtps_epi32(inputVal3); intInputVal4 = _mm_cvtps_epi32(inputVal4); - + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -57,7 +57,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f outputVectorPtr += 16; } - number = sixteenthPoints * 16; + number = sixteenthPoints * 16; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -82,7 +82,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; float min_val = -128; @@ -110,7 +110,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h index 12991e9c1..870e9419b 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h @@ -18,7 +18,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; - + const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; @@ -47,7 +47,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f intInputVal2 = _mm_cvtps_epi32(inputVal2); intInputVal3 = _mm_cvtps_epi32(inputVal3); intInputVal4 = _mm_cvtps_epi32(inputVal4); - + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -57,7 +57,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f outputVectorPtr += 16; } - number = sixteenthPoints * 16; + number = sixteenthPoints * 16; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -83,7 +83,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; @@ -111,7 +111,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h index d1c6f3f65..99b8e68c5 100644 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h @@ -23,11 +23,11 @@ static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* __m128 aVal, bVal, cVal; bVal = _mm_set_ps1(scalar); for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - - cVal = _mm_mul_ps(aVal, bVal); - + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; @@ -60,11 +60,11 @@ static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* __m256 aVal, bVal, cVal; bVal = _mm256_set1_ps(scalar); for(;number < eighthPoints; number++){ - - aVal = _mm256_load_ps(aPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - + + aVal = _mm256_load_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 8; diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h index 0e700060f..b3fae9b05 100644 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h @@ -23,11 +23,11 @@ static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* __m128 aVal, bVal, cVal; bVal = _mm_set_ps1(scalar); for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - - cVal = _mm_mul_ps(aVal, bVal); - + + aVal = _mm_loadu_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; @@ -60,11 +60,11 @@ static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* __m256 aVal, bVal, cVal; bVal = _mm256_set1_ps(scalar); for(;number < eighthPoints; number++){ - - aVal = _mm256_loadu_ps(aPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - + + aVal = _mm256_loadu_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container aPtr += 8; diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/include/volk/volk_32f_s32f_power_32f_a.h index 09c905961..633ad14b0 100644 --- a/volk/include/volk/volk_32f_s32f_power_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_power_32f_a.h @@ -21,7 +21,7 @@ */ static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ unsigned int number = 0; - + float* cPtr = cVector; const float* aPtr = aVector; @@ -33,22 +33,22 @@ static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* __m128 negatedValues; __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); __m128 onesMask = _mm_set_ps1(1); - + __m128 aVal, cVal; for(;number < quarterPoints; number++){ - + aVal = _mm_load_ps(aPtr); signMask = _mm_cmplt_ps(aVal, zeroValue); negatedValues = _mm_sub_ps(zeroValue, aVal); aVal = _mm_blendv_ps(aVal, negatedValues, signMask); - + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after cVal = powf4(aVal, vPower); // Takes each input value to the specified power cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); _mm_store_ps(cPtr,cVal); // Store the results back into the C container - + aPtr += 4; cPtr += 4; } @@ -78,7 +78,7 @@ static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* */ static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ unsigned int number = 0; - + float* cPtr = cVector; const float* aPtr = aVector; @@ -90,22 +90,22 @@ static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aV __m128 negatedValues; __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); __m128 onesMask = _mm_set_ps1(1); - + __m128 aVal, cVal; for(;number < quarterPoints; number++){ - + aVal = _mm_load_ps(aPtr); signMask = _mm_cmplt_ps(aVal, zeroValue); negatedValues = _mm_sub_ps(zeroValue, aVal); aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); - + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after cVal = powf4(aVal, vPower); // Takes each input value to the specified power cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); _mm_store_ps(cPtr,cVal); // Store the results back into the C container - + aPtr += 4; cPtr += 4; } diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h index 75fe0cb2e..98401b2d4 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h @@ -29,7 +29,7 @@ static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* __m128 aVal1, aVal2, aVal3, aVal4; __m128 cVal1, cVal2, cVal3, cVal4; for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal1 = _mm_load_ps(aPtr); aPtr += 4; cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); aVal2 = _mm_load_ps(aPtr); aPtr += 4; @@ -47,12 +47,12 @@ static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container returnValue = squareBuffer[0]; returnValue += squareBuffer[1]; returnValue += squareBuffer[2]; returnValue += squareBuffer[3]; - + number = sixteenthPoints * 16; for(;number < num_points; number++){ returnValue += (*aPtr) * (*aPtr); @@ -93,12 +93,12 @@ static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* in squareAccumulator = _mm_add_ps(squareAccumulator, aVal); aPtr += 4; } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container returnValue = squareBuffer[0]; returnValue += squareBuffer[1]; returnValue += squareBuffer[2]; returnValue += squareBuffer[3]; - + number = quarterPoints * 4; for(;number < num_points; number++){ returnValue += (*aPtr) * (*aPtr); @@ -125,7 +125,7 @@ static inline void volk_32f_s32f_stddev_32f_a_generic(float* stddev, const float if(num_points > 0){ const float* aPtr = inputBuffer; unsigned int number = 0; - + for(number = 0; number < num_points; number++){ returnValue += (*aPtr) * (*aPtr); aPtr++; diff --git a/volk/include/volk/volk_32f_sqrt_32f_a.h b/volk/include/volk/volk_32f_sqrt_32f_a.h index e44c73cfd..d9b16fc0f 100644 --- a/volk/include/volk/volk_32f_sqrt_32f_a.h +++ b/volk/include/volk/volk_32f_sqrt_32f_a.h @@ -22,11 +22,11 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, __m128 aVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - - cVal = _mm_sqrt_ps(aVal); - + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_sqrt_ps(aVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h index 20ff676d8..7de32f7b1 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h @@ -31,7 +31,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float __m128 aVal1, aVal2, aVal3, aVal4; __m128 cVal1, cVal2, cVal3, cVal4; for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal1 = _mm_load_ps(aPtr); aPtr += 4; cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x @@ -54,7 +54,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 } _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container newMean = meanBuffer[0]; newMean += meanBuffer[1]; newMean += meanBuffer[2]; @@ -63,7 +63,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float returnValue += squareBuffer[1]; returnValue += squareBuffer[2]; returnValue += squareBuffer[3]; - + number = sixteenthPoints * 16; for(;number < num_points; number++){ returnValue += (*aPtr) * (*aPtr); @@ -110,7 +110,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* m aPtr += 4; } _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container newMean = meanBuffer[0]; newMean += meanBuffer[1]; newMean += meanBuffer[2]; @@ -119,7 +119,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* m returnValue += squareBuffer[1]; returnValue += squareBuffer[2]; returnValue += squareBuffer[3]; - + number = quarterPoints * 4; for(;number < num_points; number++){ returnValue += (*aPtr) * (*aPtr); @@ -149,7 +149,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_generic(float* stddev, floa if(num_points > 0){ const float* aPtr = inputBuffer; unsigned int number = 0; - + for(number = 0; number < num_points; number++){ returnValue += (*aPtr) * (*aPtr); newMean += *aPtr++; diff --git a/volk/include/volk/volk_32f_x2_add_32f_a.h b/volk/include/volk/volk_32f_x2_add_32f_a.h index 3bc83653b..51e63e54d 100644 --- a/volk/include/volk/volk_32f_x2_add_32f_a.h +++ b/volk/include/volk/volk_32f_x2_add_32f_a.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVecto __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - + + cVal = _mm_add_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h index e360a7958..52e8286bc 100644 --- a/volk/include/volk/volk_32f_x2_add_32f_u.h +++ b/volk/include/volk/volk_32f_x2_add_32f_u.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVecto __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); + + aVal = _mm_loadu_ps(aPtr); bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - + + cVal = _mm_add_ps(aVal, bVal); + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a.h b/volk/include/volk/volk_32f_x2_divide_32f_a.h index 52ddfae87..7b60fb22e 100644 --- a/volk/include/volk/volk_32f_x2_divide_32f_a.h +++ b/volk/include/volk/volk_32f_x2_divide_32f_a.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVe __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_div_ps(aVal, bVal); - + + cVal = _mm_div_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h index 0c58f2ecf..448b2fdc0 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -18,7 +18,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa for(number = 0; number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } - + *result = dotProduct; } @@ -29,7 +29,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - + unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -42,11 +42,11 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* __m128 dotProdVal = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); + + cVal = _mm_mul_ps(aVal, bVal); dotProdVal = _mm_add_ps(cVal, dotProdVal); @@ -69,10 +69,10 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* } *result = dotProduct; - + } -#endif /*LV_HAVE_SSE*/ +#endif /*LV_HAVE_SSE*/ #ifdef LV_HAVE_SSE3 @@ -91,11 +91,11 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * __m128 dotProdVal = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); + + cVal = _mm_mul_ps(aVal, bVal); dotProdVal = _mm_hadd_ps(dotProdVal, cVal); @@ -117,7 +117,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * } *result = dotProduct; -} +} #endif /*LV_HAVE_SSE3*/ @@ -140,7 +140,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float __m128 dotProdVal = _mm_setzero_ps(); - for(;number < sixteenthPoints; number++){ + for(;number < sixteenthPoints; number++){ aVal1 = _mm_load_ps(aPtr); aPtr += 4; aVal2 = _mm_load_ps(aPtr); aPtr += 4; @@ -151,7 +151,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float bVal2 = _mm_load_ps(bPtr); bPtr += 4; bVal3 = _mm_load_ps(bPtr); bPtr += 4; bVal4 = _mm_load_ps(bPtr); bPtr += 4; - + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); @@ -178,7 +178,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float } *result = dotProduct; -} +} #endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index 7f47122ff..3b7284b57 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -17,7 +17,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa for(number = 0; number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } - + *result = dotProduct; } @@ -28,7 +28,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - + unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -41,11 +41,11 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* __m128 dotProdVal = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); + + aVal = _mm_loadu_ps(aPtr); bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); + + cVal = _mm_mul_ps(aVal, bVal); dotProdVal = _mm_add_ps(cVal, dotProdVal); @@ -68,10 +68,10 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* } *result = dotProduct; - + } -#endif /*LV_HAVE_SSE*/ +#endif /*LV_HAVE_SSE*/ #ifdef LV_HAVE_SSE3 @@ -90,11 +90,11 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * __m128 dotProdVal = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); + + aVal = _mm_loadu_ps(aPtr); bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); + + cVal = _mm_mul_ps(aVal, bVal); dotProdVal = _mm_hadd_ps(dotProdVal, cVal); @@ -116,7 +116,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * } *result = dotProduct; -} +} #endif /*LV_HAVE_SSE3*/ @@ -140,7 +140,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float __m128 dotProdVal = _mm_setzero_ps(); for(;number < sixteenthPoints; number++){ - + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; @@ -150,7 +150,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; - + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); @@ -177,7 +177,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float } *result = dotProduct; -} +} #endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h index 1d4d2dbbd..52d80b6bb 100644 --- a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h +++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h @@ -20,7 +20,7 @@ static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, c const float* qBufferPtr = qBuffer; const uint64_t quarterPoints = num_points / 4; - + __m128 iValue, qValue, cplxValue; for(;number < quarterPoints; number++){ iValue = _mm_load_ps(iBufferPtr); diff --git a/volk/include/volk/volk_32f_x2_max_32f_a.h b/volk/include/volk/volk_32f_x2_max_32f_a.h index 7948c458d..79f2d04b5 100644 --- a/volk/include/volk/volk_32f_x2_max_32f_a.h +++ b/volk/include/volk/volk_32f_x2_max_32f_a.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVecto __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_max_ps(aVal, bVal); - + + cVal = _mm_max_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_x2_min_32f_a.h b/volk/include/volk/volk_32f_x2_min_32f_a.h index d77134868..42cac0833 100644 --- a/volk/include/volk/volk_32f_x2_min_32f_a.h +++ b/volk/include/volk/volk_32f_x2_min_32f_a.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVecto __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_min_ps(aVal, bVal); - + + cVal = _mm_min_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a.h b/volk/include/volk/volk_32f_x2_multiply_32f_a.h index fae9a652f..340e05165 100644 --- a/volk/include/volk/volk_32f_x2_multiply_32f_a.h +++ b/volk/include/volk/volk_32f_x2_multiply_32f_a.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* a __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - + + cVal = _mm_mul_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; @@ -62,12 +62,12 @@ static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* a __m256 aVal, bVal, cVal; for(;number < eighthPoints; number++){ - - aVal = _mm256_load_ps(aPtr); + + aVal = _mm256_load_ps(aPtr); bVal = _mm256_load_ps(bPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - + + cVal = _mm256_mul_ps(aVal, bVal); + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 8; diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h index 6c3ce5d83..bfb896d60 100644 --- a/volk/include/volk/volk_32f_x2_multiply_32f_u.h +++ b/volk/include/volk/volk_32f_x2_multiply_32f_u.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* a __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); + + aVal = _mm_loadu_ps(aPtr); bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - + + cVal = _mm_mul_ps(aVal, bVal); + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; @@ -62,12 +62,12 @@ static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* a __m256 aVal, bVal, cVal; for(;number < eighthPoints; number++){ - - aVal = _mm256_loadu_ps(aPtr); + + aVal = _mm256_loadu_ps(aPtr); bVal = _mm256_loadu_ps(bPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - + + cVal = _mm256_mul_ps(aVal, bVal); + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container aPtr += 8; diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h index cc02c3678..10fc267dc 100644 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h @@ -23,7 +23,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVec __m128 vScalar = _mm_set_ps1(scalar); const unsigned int quarterPoints = num_points / 4; - + __m128 iValue, qValue, cplxValue1, cplxValue2; __m128i intValue1, intValue2; @@ -59,7 +59,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVec *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); } - + } #endif /* LV_HAVE_SSE2 */ @@ -81,7 +81,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVect __m128 vScalar = _mm_set_ps1(scalar); const unsigned int quarterPoints = num_points / 4; - + __m128 iValue, qValue, cplxValue; int16_t* complexVectorPtr = (int16_t*)complexVector; @@ -106,9 +106,9 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVect // Interleaves the upper two values in the i and q variables into one buffer cplxValue = _mm_unpackhi_ps(iValue, qValue); cplxValue = _mm_mul_ps(cplxValue, vScalar); - + _mm_store_ps(floatBuffer, cplxValue); - + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); *complexVectorPtr++ = (int16_t)(floatBuffer[1]); *complexVectorPtr++ = (int16_t)(floatBuffer[2]); @@ -124,7 +124,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVect *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); } - + } #endif /* LV_HAVE_SSE */ diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a.h b/volk/include/volk/volk_32f_x2_subtract_32f_a.h index 16cad008a..e2b8be797 100644 --- a/volk/include/volk/volk_32f_x2_subtract_32f_a.h +++ b/volk/include/volk/volk_32f_x2_subtract_32f_a.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* a __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_sub_ps(aVal, bVal); - + + cVal = _mm_sub_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h index 153bb3a25..3c530628c 100644 --- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h +++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h @@ -14,33 +14,33 @@ #include static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - + + float result = 0.0; float fst = 0.0; float sq = 0.0; float thrd = 0.0; float frth = 0.0; //float fith = 0.0; - - - + + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; xmm9 = _mm_setzero_ps(); xmm1 = _mm_setzero_ps(); - + xmm0 = _mm_load1_ps(¢er_point_array[0]); xmm6 = _mm_load1_ps(¢er_point_array[1]); xmm7 = _mm_load1_ps(¢er_point_array[2]); xmm8 = _mm_load1_ps(¢er_point_array[3]); //xmm11 = _mm_load1_ps(¢er_point_array[4]); xmm10 = _mm_load1_ps(cutoff); - + int bound = num_bytes >> 4; int leftovers = (num_bytes >> 2) & 3; int i = 0; - + for(; i < bound; ++i) { xmm2 = _mm_load_ps(src0); xmm2 = _mm_max_ps(xmm10, xmm2); @@ -57,23 +57,23 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0 xmm2 = _mm_add_ps(xmm2, xmm3); xmm3 = _mm_add_ps(xmm4, xmm5); - + src0 += 4; - + xmm9 = _mm_add_ps(xmm2, xmm9); - + xmm1 = _mm_add_ps(xmm3, xmm1); //xmm9 = _mm_add_ps(xmm12, xmm9); } - + xmm2 = _mm_hadd_ps(xmm9, xmm1); xmm3 = _mm_hadd_ps(xmm2, xmm2); xmm4 = _mm_hadd_ps(xmm3, xmm3); _mm_store_ss(&result, xmm4); - - + + for(i = 0; i < leftovers; ++i) { fst = src0[i]; @@ -82,11 +82,11 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0 thrd = fst * sq; frth = sq * sq; //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth);// + + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth);// + //center_point_array[4] * fith); } @@ -94,7 +94,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0 target[0] = result; } - + #endif /*LV_HAVE_SSE3*/ @@ -103,45 +103,45 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0 static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - + float result = 0.0; float fst = 0.0; float sq = 0.0; float thrd = 0.0; float frth = 0.0; //float fith = 0.0; - - unsigned int i = 0; - + + unsigned int i = 0; + for(; i < num_bytes >> 2; ++i) { fst = src0[i]; fst = MAX(fst, *cutoff); - + sq = fst * fst; thrd = fst * sq; frth = sq * sq; //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + center_point_array[3] * frth); //+ //center_point_array[4] * fith); - /*printf("%f12...%d\n", (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + + /*printf("%f12...%d\n", (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + center_point_array[3] * frth) + - //center_point_array[4] * fith) + + //center_point_array[4] * fith) + (center_point_array[4]), i); */ } result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); - - + + *target = result; } diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h index b7350b9fa..28d584bf2 100644 --- a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h @@ -23,11 +23,11 @@ static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const l __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; for(;number < quarterPoints; number++){ - + aVal1 = _mm_load_ps((const float*)aPtr); aPtr += 2; - - aVal2 = _mm_load_ps((const float*)aPtr); + + aVal2 = _mm_load_ps((const float*)aPtr); aPtr += 2; bVal = _mm_load_ps(bPtr); @@ -36,13 +36,13 @@ static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const l bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); - cVal = _mm_mul_ps(aVal1, bVal1); - + cVal = _mm_mul_ps(aVal1, bVal1); + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container cPtr += 2; - cVal = _mm_mul_ps(aVal2, bVal2); - + cVal = _mm_mul_ps(aVal2, bVal2); + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container cPtr += 2; @@ -69,7 +69,7 @@ static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, con const lv_32fc_t* aPtr = aVector; const float* bPtr= bVector; unsigned int number = 0; - + for(number = 0; number < num_points; number++){ *cPtr++ = (*aPtr++) * (*bPtr++); } diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_conjugate_32fc_a.h index 1518af9be..919280d51 100644 --- a/volk/include/volk/volk_32fc_conjugate_32fc_a.h +++ b/volk/include/volk/volk_32fc_conjugate_32fc_a.h @@ -25,11 +25,11 @@ static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); for(;number < halfPoints; number++){ - + x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi x = _mm_xor_ps(x, conjugator); // conjugate register - + _mm_store_ps((float*)c,x); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h index b26fe0789..e0d79ea7b 100644 --- a/volk/include/volk/volk_32fc_conjugate_32fc_u.h +++ b/volk/include/volk/volk_32fc_conjugate_32fc_u.h @@ -21,13 +21,13 @@ static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_ __m128 x; lv_32fc_t* c = cVector; const lv_32fc_t* a = aVector; - + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); for(;number < halfPoints; number++){ - + x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi - + x = _mm_xor_ps(x, conjugator); // conjugate register _mm_storeu_ps((float*)c,x); // Store the results back into the C container diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h index 9de036ef4..4106f3851 100644 --- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h @@ -19,10 +19,10 @@ static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qB float* qBufferPtr = qBuffer; unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 cplxValue1, cplxValue2, iValue, qValue; for(;number < quarterPoints; number++){ - + cplxValue1 = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h index 29c369d9a..77566e671 100644 --- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h @@ -20,23 +20,23 @@ static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* double* iBufferPtr = iBuffer; double* qBufferPtr = qBuffer; - const unsigned int halfPoints = num_points / 2; + const unsigned int halfPoints = num_points / 2; __m128 cplxValue, fVal; __m128d dVal; for(;number < halfPoints; number++){ - + cplxValue = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; // Arrange in i1i2i1i2 format fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); + dVal = _mm_cvtps_pd(fVal); _mm_store_pd(iBufferPtr, dVal); // Arrange in q1q2q1q2 format fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); - dVal = _mm_cvtps_pd(fVal); + dVal = _mm_cvtps_pd(fVal); _mm_store_pd(qBufferPtr, dVal); iBufferPtr += 2; diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h index adc4112b9..c88809beb 100644 --- a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h @@ -21,7 +21,7 @@ static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const l __m128 cplxValue1, cplxValue2, iValue; for(;number < quarterPoints; number++){ - + cplxValue1 = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h index a1d0fd5d1..0d6c6b7af 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h @@ -21,7 +21,7 @@ static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const l __m128 cplxValue1, cplxValue2, iValue; for(;number < quarterPoints; number++){ - + cplxValue1 = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h index 70a3b1971..1e346baca 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h @@ -18,17 +18,17 @@ static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, const const float* complexVectorPtr = (float*)complexVector; double* iBufferPtr = iBuffer; - const unsigned int halfPoints = num_points / 2; + const unsigned int halfPoints = num_points / 2; __m128 cplxValue, fVal; __m128d dVal; for(;number < halfPoints; number++){ - + cplxValue = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; // Arrange in i1i2i1i2 format fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); + dVal = _mm_cvtps_pd(fVal); _mm_store_pd(iBufferPtr, dVal); iBufferPtr += 2; diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/include/volk/volk_32fc_index_max_16u_a.h index 125a34582..842a6a042 100644 --- a/volk/include/volk/volk_32fc_index_max_16u_a.h +++ b/volk/include/volk/volk_32fc_index_max_16u_a.h @@ -12,16 +12,16 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { - - - + + + union bit128 holderf; union bit128 holderi; float sq_dist = 0.0; - + union bit128 xmm5, xmm4; __m128 xmm1, xmm2, xmm3; __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; @@ -30,63 +30,63 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_ xmm4.int_vec = xmmfour = _mm_setzero_si128(); holderf.int_vec = holder0 = _mm_setzero_si128(); holderi.int_vec = holder1 = _mm_setzero_si128(); - - + + int bound = num_bytes >> 5; int leftovers0 = (num_bytes >> 4) & 1; int leftovers1 = (num_bytes >> 3) & 1; int i = 0; - - + + xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! xmm9 = xmm8 = _mm_setzero_si128(); xmm10 = _mm_set_epi32(4, 4, 4, 4); xmm3 = _mm_setzero_ps(); ; - + //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); - + for(; i < bound; ++i) { - + xmm1 = _mm_load_ps((float*)src0); xmm2 = _mm_load_ps((float*)&src0[2]); - + src0 += 4; - - + + xmm1 = _mm_mul_ps(xmm1, xmm1); xmm2 = _mm_mul_ps(xmm2, xmm2); - - + + xmm1 = _mm_hadd_ps(xmm1, xmm2); xmm3 = _mm_max_ps(xmm1, xmm3); - + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - + xmm9 = _mm_add_epi32(xmm11, xmm12); - xmm8 = _mm_add_epi32(xmm8, xmm10); + xmm8 = _mm_add_epi32(xmm8, xmm10); + - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); } - - + + for(i = 0; i < leftovers0; ++i) { xmm2 = _mm_load_ps((float*)src0); - + xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); xmm8 = bit128_p(&xmm1)->int_vec; @@ -99,63 +99,63 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_ xmm3 = _mm_max_ps(xmm1, xmm3); xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); - - + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - + xmm9 = _mm_add_epi32(xmm11, xmm12); - xmm8 = _mm_add_epi32(xmm8, xmm10); + xmm8 = _mm_add_epi32(xmm8, xmm10); //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); } - - - + + + for(i = 0; i < leftovers1; ++i) { //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - + sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - + xmm2 = _mm_load1_ps(&sq_dist); xmm1 = xmm3; - + xmm3 = _mm_max_ss(xmm3, xmm2); - - + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - xmm8 = _mm_shuffle_epi32(xmm8, 0x00); - + + + xmm8 = _mm_shuffle_epi32(xmm8, 0x00); + xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); - + xmm9 = _mm_add_epi32(xmm11, xmm12); } - + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); _mm_store_ps((float*)&(holderf.f), xmm3); _mm_store_si128(&(holderi.int_vec), xmm9); - + target[0] = holderi.i[0]; - sq_dist = holderf.f[0]; + sq_dist = holderf.f[0]; target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; @@ -163,27 +163,27 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; - - + + /* float placeholder = 0.0; - uint32_t temp0, temp1; + uint32_t temp0, temp1; unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); unsigned int l0 = g0 ^ 1; unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); unsigned int l1 = g1 ^ 1; - + temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; - sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; + sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; - + g0 = (sq_dist > placeholder); l0 = g0 ^ 1; target[0] = g0 * temp0 + l0 * temp1; */ - + } #endif /*LV_HAVE_SSE3*/ @@ -193,20 +193,20 @@ static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32 float sq_dist = 0.0; float max = 0.0; unsigned int index = 0; - - unsigned int i = 0; - + + unsigned int i = 0; + for(; i < num_bytes >> 3; ++i) { sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); - + index = sq_dist > max ? i : index; max = sq_dist > max ? sq_dist : max; - - + + } target[0] = index; - + } #endif /*LV_HAVE_GENERIC*/ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a.h b/volk/include/volk/volk_32fc_magnitude_32f_a.h index f18e9bc0b..efb84a904 100644 --- a/volk/include/volk/volk_32fc_magnitude_32f_a.h +++ b/volk/include/volk/volk_32fc_magnitude_32f_a.h @@ -59,7 +59,7 @@ static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* complexVectorPtr = (float*)complexVector; float* magnitudeVectorPtr = magnitudeVector; diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h index ed1cedef9..c8b3f0a08 100644 --- a/volk/include/volk/volk_32fc_magnitude_32f_u.h +++ b/volk/include/volk/volk_32fc_magnitude_32f_u.h @@ -59,7 +59,7 @@ static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* complexVectorPtr = (float*)complexVector; float* magnitudeVectorPtr = magnitudeVector; diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h index 00bdefbb5..d3ac9717a 100644 --- a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h +++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h @@ -57,7 +57,7 @@ static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* complexVectorPtr = (float*)complexVector; float* magnitudeVectorPtr = magnitudeVector; diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h index 6eb4a523a..53a4e68eb 100644 --- a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h +++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h @@ -57,7 +57,7 @@ static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* complexVectorPtr = (float*)complexVector; float* magnitudeVectorPtr = magnitudeVector; diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h index 7bd001aa0..d86bd63c1 100644 --- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h @@ -27,14 +27,14 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const const float invNormalizeFactor = 1.0 / normalizeFactor; #ifdef LV_HAVE_LIB_SIMDMATH - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 testVector = _mm_set_ps1(2*M_PI); __m128 correctVector = _mm_set_ps1(M_PI); __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); __m128 phase; __m128 complex1, complex2, iValue, qValue; __m128 keepMask; - + for (; number < quarterPoints; number++) { // Load IQ data: complex1 = _mm_load_ps(complexVectorPtr); @@ -42,15 +42,15 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const complex2 = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); // Arctan to get phase: phase = atan2f4(qValue, iValue); // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. // Compare to 2pi: keepMask = _mm_cmpneq_ps(phase,testVector); phase = _mm_blendv_ps(correctVector, phase, keepMask); - // done with above correction. + // done with above correction. phase = _mm_mul_ps(phase, vNormalizeFactor); _mm_store_ps((float*)outPtr, phase); outPtr += 4; @@ -89,7 +89,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv const float invNormalizeFactor = 1.0 / normalizeFactor; #ifdef LV_HAVE_LIB_SIMDMATH - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 testVector = _mm_set_ps1(2*M_PI); __m128 correctVector = _mm_set_ps1(M_PI); __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); @@ -97,7 +97,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv __m128 complex1, complex2, iValue, qValue; __m128 mask; __m128 keepMask; - + for (; number < quarterPoints; number++) { // Load IQ data: complex1 = _mm_load_ps(complexVectorPtr); @@ -105,8 +105,8 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv complex2 = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); // Arctan to get phase: phase = atan2f4(qValue, iValue); // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. @@ -115,7 +115,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv phase = _mm_and_ps(phase, keepMask); mask = _mm_andnot_ps(keepMask, correctVector); phase = _mm_or_ps(phase, mask); - // done with above correction. + // done with above correction. phase = _mm_mul_ps(phase, vNormalizeFactor); _mm_store_ps((float*)outPtr, phase); outPtr += 4; diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h index 588b532b4..3106edbef 100644 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h @@ -28,55 +28,55 @@ static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, con */ static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ unsigned int number = 0; - + lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; #ifdef LV_HAVE_LIB_SIMDMATH const unsigned int quarterPoints = num_points / 4; __m128 vPower = _mm_set_ps1(power); - + __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps((float*)aPtr); + + cplxValue1 = _mm_load_ps((float*)aPtr); aPtr += 2; - - cplxValue2 = _mm_load_ps((float*)aPtr); + + cplxValue2 = _mm_load_ps((float*)aPtr); aPtr += 2; - + // Convert to polar coordinates - + // Arrange in i1i2i3i4 format iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); // Arrange in q1q2q3q4 format qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - + phase = atan2f4(qValue, iValue); // Calculate the Phase - + magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values - + // Now calculate the power of the polar coordinate data magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power - + phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power - + // Convert back to cartesian coordinates iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude - + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values - + _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container - + cPtr += 2; - + _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container - + cPtr += 2; } - + number = quarterPoints * 4; #endif /* LV_HAVE_LIB_SIMDMATH */ diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h index 8d1959dae..30a77dbc1 100644 --- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h @@ -34,7 +34,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu __m128 input1, input2; const uint64_t quarterPoints = num_points / 4; for(;number < quarterPoints; number++){ - // Load the complex values + // Load the complex values input1 =_mm_load_ps(inputPtr); inputPtr += 4; input2 =_mm_load_ps(inputPtr); @@ -43,30 +43,30 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu // Apply the normalization factor input1 = _mm_mul_ps(input1, invNormalizationFactor); input2 = _mm_mul_ps(input2, invNormalizationFactor); - + // Multiply each value by itself // (r1*r1), (i1*i1), (r2*r2), (i2*i2) input1 = _mm_mul_ps(input1, input1); // (r3*r3), (i3*i3), (r4*r4), (i4*i4) input2 = _mm_mul_ps(input2, input2); - + // Horizontal add, to add (r*r) + (i*i) for each complex value // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) power = _mm_hadd_ps(input1, input2); - + // Calculate the natural log power power = logf4(power); - + // Convert to log10 and multiply by 10.0 power = _mm_mul_ps(power, magScalar); - + // Store the floating point results _mm_store_ps(destPtr, power); - + destPtr += 4; } - - number = quarterPoints*4; + + number = quarterPoints*4; #endif /* LV_HAVE_LIB_SIMDMATH */ // Calculate the FFT for any remaining points @@ -81,10 +81,10 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu const float imag = *inputPtr++ * iNormalizationFactor; *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - + destPtr++; } - + } #endif /* LV_HAVE_SSE3 */ @@ -114,7 +114,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_generic(float* logPowerOu *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - + realFFTDataPointsPtr++; } } diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h index fc635f171..27f755351 100644 --- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h @@ -32,19 +32,19 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo magScalar = _mm_div_ps(magScalar, logf4(magScalar)); __m128 invRBW = _mm_set_ps1(iRBW); - + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); __m128 power; __m128 input1, input2; const uint64_t quarterPoints = num_points / 4; for(;number < quarterPoints; number++){ - // Load the complex values + // Load the complex values input1 =_mm_load_ps(inputPtr); inputPtr += 4; input2 =_mm_load_ps(inputPtr); inputPtr += 4; - + // Apply the normalization factor input1 = _mm_mul_ps(input1, invNormalizationFactor); input2 = _mm_mul_ps(input2, invNormalizationFactor); @@ -54,7 +54,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo input1 = _mm_mul_ps(input1, input1); // (r3*r3), (i3*i3), (r4*r4), (i4*i4) input2 = _mm_mul_ps(input2, input2); - + // Horizontal add, to add (r*r) + (i*i) for each complex value // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) power = _mm_hadd_ps(input1, input2); @@ -64,17 +64,17 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo // Calculate the natural log power power = logf4(power); - + // Convert to log10 and multiply by 10.0 power = _mm_mul_ps(power, magScalar); - + // Store the floating point results _mm_store_ps(destPtr, power); - + destPtr += 4; } - - number = quarterPoints*4; + + number = quarterPoints*4; #endif /* LV_HAVE_LIB_SIMDMATH */ // Calculate the FFT for any remaining points for(; number < num_points; number++){ @@ -83,14 +83,14 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) // 75 ohm load assumption // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - + const float real = *inputPtr++ * iNormalizationFactor; const float imag = *inputPtr++ * iNormalizationFactor; *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); destPtr++; } - + } #endif /* LV_HAVE_SSE3 */ @@ -122,7 +122,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_generic(float* const float imag = *inputPtr++ * iNormalizationFactor; *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); - + realFFTDataPointsPtr++; } } diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h index 534dc2a25..f206c5e87 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h @@ -28,17 +28,17 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons yh = _mm_set_ps1(lv_cimag(scalar)); for(;number < halfPoints; number++){ - + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - + _mm_store_ps((float*)c,z); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h index 218c450f8..5c7d15b02 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h @@ -28,17 +28,17 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons yh = _mm_set_ps1(lv_cimag(scalar)); for(;number < halfPoints; number++){ - + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - + _mm_storeu_ps((float*)c,z); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h index 655075528..e3dedf2fc 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h @@ -10,40 +10,40 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + float * res = (float*) result; float * in = (float*) input; float * tp = (float*) taps; unsigned int n_2_ccomplex_blocks = num_bytes >> 4; unsigned int isodd = (num_bytes >> 3) &1; - - - + + + float sum0[2] = {0,0}; float sum1[2] = {0,0}; unsigned int i = 0; - + for(i = 0; i < n_2_ccomplex_blocks; ++i) { - + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; sum1[0] += in[2] * tp[2] + in[3] * tp[3]; sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - + + in += 4; tp += 4; } - - + + res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - - - + + + for(i = 0; i < isodd; ++i) { @@ -64,13 +64,13 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* res static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - asm volatile + + asm volatile ( "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" "# const float *taps, unsigned num_bytes)\n\t" @@ -187,32 +187,32 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) :"rax", "r8", "r9", "r10" ); - - + + int getem = num_bytes % 16; - - + + for(; getem > 0; getem -= 8) { - - + + *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); - + } return; -} +} #endif #if LV_HAVE_SSE && LV_HAVE_32 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; int bound = num_bytes >> 4; int leftovers = num_bytes % 16; - - asm volatile + + asm volatile ( " #pushl %%ebp\n\t" " #movl %%esp, %%ebp\n\t" @@ -226,7 +226,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* resu " movaps 0(%[edx]), %%xmm2\n\t" " movl %[ecx], (%[out])\n\t" " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" - + " xorps %%xmm1, %%xmm2\n\t" " jmp .%=L1_test\n\t" " # 4 taps / loop\n\t" @@ -317,28 +317,28 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* resu : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) ); - - - + + + printf("%d, %d\n", leftovers, bound); - + for(; leftovers > 0; leftovers -= 8) { - - + + *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); - + } - + return; - - - - - + + + + + } -#endif /*LV_HAVE_SSE*/ +#endif /*LV_HAVE_SSE*/ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h index 3ae7208a8..e7493413f 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -9,39 +9,39 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + float * res = (float*) result; float * in = (float*) input; float * tp = (float*) taps; unsigned int n_2_ccomplex_blocks = num_bytes >> 4; unsigned int isodd = (num_bytes >> 3) &1; - - - + + + float sum0[2] = {0,0}; float sum1[2] = {0,0}; unsigned int i = 0; - + for(i = 0; i < n_2_ccomplex_blocks; ++i) { - + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; sum1[0] += in[2] * tp[2] + in[3] * tp[3]; sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - + + in += 4; tp += 4; } - - + + res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - - - + + + for(i = 0; i < isodd; ++i) { @@ -73,7 +73,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result uint32_t intRep[4]; __m128 vec; } halfMask; - + union NegMask { int intRep[4]; __m128 vec; @@ -85,13 +85,13 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; __m128 zv = {0,0,0,0}; - + halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; negMask.intRep[0] = negMask.intRep[2] = 0x80000000; negMask.intRep[1] = negMask.intRep[3] = 0; - + // main loop while(num_bytes >= 4*sizeof(float)){ diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h index cde9240cc..cb2ac4c67 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h @@ -7,44 +7,44 @@ #include -#ifdef LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + float * res = (float*) result; float * in = (float*) input; float * tp = (float*) taps; unsigned int n_2_ccomplex_blocks = num_bytes >> 4; unsigned int isodd = (num_bytes >> 3) &1; - - - + + + float sum0[2] = {0,0}; float sum1[2] = {0,0}; unsigned int i = 0; - + for(i = 0; i < n_2_ccomplex_blocks; ++i) { - + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; sum0[1] += in[0] * tp[1] + in[1] * tp[0]; sum1[0] += in[2] * tp[2] - in[3] * tp[3]; sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - + + in += 4; tp += 4; } - + res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - - - + + + for(i = 0; i < isodd; ++i) { @@ -61,9 +61,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - asm + + asm ( "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" "# const float *taps, unsigned num_bytes)\n\t" @@ -175,20 +175,20 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) :"rax", "r8", "r9", "r10" ); - - + + int getem = num_bytes % 16; - - + + for(; getem > 0; getem -= 8) { - - + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - + } return; - + } #endif @@ -200,7 +200,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_bytes); #if 0 - asm volatile + asm volatile ( " #pushl %%ebp\n\t" " #movl %%esp, %%ebp\n\t" @@ -299,28 +299,28 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const : "eax", "ecx", "edx" ); - + int getem = num_bytes % 16; - + for(; getem > 0; getem -= 8) { - - + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - + } - + return; -#endif +#endif } -#endif /*LV_HAVE_SSE*/ +#endif /*LV_HAVE_SSE*/ #ifdef LV_HAVE_SSE3 #include static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + lv_32fc_t dotProduct; memset(&dotProduct, 0x0, 2*sizeof(float)); @@ -336,19 +336,19 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv dotProdVal = _mm_setzero_ps(); for(;number < halfPoints; number++){ - + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together @@ -368,7 +368,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv } *result = dotProduct; -} +} #endif /*LV_HAVE_SSE3*/ @@ -379,7 +379,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { volk_32fc_x2_dot_prod_32fc_a_sse3(result, input, taps, num_bytes); // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now - /* + /* __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; float *p_input, *p_taps; __m64 *p_result; @@ -391,7 +391,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const static const __m128i neg = {0x000000000000000080000000}; int i = 0; - + int bound = (num_bytes >> 5); int leftovers = (num_bytes & 24) >> 3; @@ -399,27 +399,27 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const real1 = _mm_sub_ps(real1, real1); im0 = _mm_sub_ps(im0, im0); im1 = _mm_sub_ps(im1, im1); - + for(; i < bound; ++i) { - - + + xmm0 = _mm_load_ps(p_input); xmm1 = _mm_load_ps(p_taps); - + p_input += 4; p_taps += 4; - + xmm2 = _mm_load_ps(p_input); xmm3 = _mm_load_ps(p_taps); - + p_input += 4; p_taps += 4; - + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); xmm5 = _mm_unpackhi_ps(xmm1, xmm3); xmm0 = _mm_unpacklo_ps(xmm0, xmm2); xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - + //imaginary vector from input xmm1 = _mm_unpackhi_ps(xmm0, xmm4); //real vector from input @@ -428,39 +428,39 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const xmm0 = _mm_unpackhi_ps(xmm2, xmm5); //real vector from taps xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - + real0 = _mm_add_ps(xmm4, real0); real1 = _mm_add_ps(xmm5, real1); im0 = _mm_add_ps(xmm6, im0); im1 = _mm_add_ps(xmm7, im1); - + } - - + + real1 = _mm_xor_ps(real1, (__m128)neg); - - + + im0 = _mm_add_ps(im0, im1); real0 = _mm_add_ps(real0, real1); - + im0 = _mm_add_ps(im0, real0); - + _mm_storel_pi(p_result, im0); - + for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { - + *result += input[i] * taps[i]; } */ -} +} #endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h index aec8bd716..f79ddb59b 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h @@ -24,21 +24,21 @@ static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const l const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; for(;number < halfPoints; number++){ - + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - + _mm_store_ps((float*)c,z); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h index 729c1a4ad..a998d6184 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h @@ -25,21 +25,21 @@ static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const l const lv_32fc_t* b = bVector; for(;number < halfPoints; number++){ - + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - + _mm_storeu_ps((float*)c,z); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h index 2a1bcbce0..2755192e9 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h @@ -27,23 +27,23 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVecto __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); for(;number < halfPoints; number++){ - + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_xor_ps(y, conjugator); // conjugate y - + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - + _mm_store_ps((float*)c,z); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h index 92f6a051e..09dcd635b 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h @@ -27,23 +27,23 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVecto __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); for(;number < halfPoints; number++){ - + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_xor_ps(y, conjugator); // conjugate y - + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - + _mm_storeu_ps((float*)c,z); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h index 2d5f36b27..75eb9173d 100644 --- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h +++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h @@ -11,7 +11,7 @@ #include static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; @@ -23,31 +23,31 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t int leftovers0 = (num_bytes >> 4) & 1; int leftovers1 = (num_bytes >> 3) & 1; int i = 0; - - - + + + xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); xmm2 = _mm_load_ps((float*)&points[0]); xmm8 = _mm_load1_ps(&scalar); xmm1 = _mm_movelh_ps(xmm1, xmm1); xmm3 = _mm_load_ps((float*)&points[2]); - - + + for(; i < bound - 1; ++i) { - + xmm4 = _mm_sub_ps(xmm1, xmm2); xmm5 = _mm_sub_ps(xmm1, xmm3); points += 4; xmm6 = _mm_mul_ps(xmm4, xmm4); xmm7 = _mm_mul_ps(xmm5, xmm5); - + xmm2 = _mm_load_ps((float*)&points[0]); - + xmm4 = _mm_hadd_ps(xmm6, xmm7); xmm3 = _mm_load_ps((float*)&points[2]); - + xmm4 = _mm_mul_ps(xmm4, xmm8); _mm_store_ps(target, xmm4); @@ -55,46 +55,46 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t target += 4; } - + xmm4 = _mm_sub_ps(xmm1, xmm2); xmm5 = _mm_sub_ps(xmm1, xmm3); - - + + points += 4; xmm6 = _mm_mul_ps(xmm4, xmm4); xmm7 = _mm_mul_ps(xmm5, xmm5); - + xmm4 = _mm_hadd_ps(xmm6, xmm7); - + xmm4 = _mm_mul_ps(xmm4, xmm8); - + _mm_store_ps(target, xmm4); - + target += 4; - + for(i = 0; i < leftovers0; ++i) { - + xmm2 = _mm_load_ps((float*)&points[0]); - + xmm4 = _mm_sub_ps(xmm1, xmm2); - + points += 2; - + xmm6 = _mm_mul_ps(xmm4, xmm4); xmm4 = _mm_hadd_ps(xmm6, xmm6); xmm4 = _mm_mul_ps(xmm4, xmm8); - + _mm_storeh_pi((__m64*)target, xmm4); target += 2; } for(i = 0; i < leftovers1; ++i) { - + diff = src0[0] - points[0]; sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); @@ -109,13 +109,13 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; - unsigned int i = 0; - + unsigned int i = 0; + for(; i < num_bytes >> 3; ++i) { diff = src0[0] - points[i]; sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - + target[i] = sq_dist; } } diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h index 6a4a08ca5..b819eaffd 100644 --- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h +++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h @@ -10,7 +10,7 @@ #include static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; @@ -22,11 +22,11 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* int i = 0; xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); xmm2 = _mm_load_ps((float*)&points[0]); xmm1 = _mm_movelh_ps(xmm1, xmm1); xmm3 = _mm_load_ps((float*)&points[2]); - + for(; i < bound - 1; ++i) { xmm4 = _mm_sub_ps(xmm1, xmm2); @@ -34,9 +34,9 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* points += 4; xmm6 = _mm_mul_ps(xmm4, xmm4); xmm7 = _mm_mul_ps(xmm5, xmm5); - + xmm2 = _mm_load_ps((float*)&points[0]); - + xmm4 = _mm_hadd_ps(xmm6, xmm7); xmm3 = _mm_load_ps((float*)&points[2]); @@ -46,41 +46,41 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* target += 4; } - + xmm4 = _mm_sub_ps(xmm1, xmm2); xmm5 = _mm_sub_ps(xmm1, xmm3); - - + + points += 4; xmm6 = _mm_mul_ps(xmm4, xmm4); xmm7 = _mm_mul_ps(xmm5, xmm5); - + xmm4 = _mm_hadd_ps(xmm6, xmm7); - + _mm_store_ps(target, xmm4); - + target += 4; for(i = 0; i < leftovers0; ++i) { - + xmm2 = _mm_load_ps((float*)&points[0]); - + xmm4 = _mm_sub_ps(xmm1, xmm2); - + points += 2; - + xmm6 = _mm_mul_ps(xmm4, xmm4); xmm4 = _mm_hadd_ps(xmm6, xmm6); - + _mm_storeh_pi((__m64*)target, xmm4); target += 2; } for(i = 0; i < leftovers1; ++i) { - + diff = src0[0] - points[0]; sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); @@ -95,13 +95,13 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; - unsigned int i = 0; - + unsigned int i = 0; + for(; i < num_bytes >> 3; ++i) { diff = src0[0] - points[i]; sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - + target[i] = sq_dist; } } diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a.h b/volk/include/volk/volk_32i_s32f_convert_32f_a.h index 558142869..8f4123d71 100644 --- a/volk/include/volk/volk_32i_s32f_convert_32f_a.h +++ b/volk/include/volk/volk_32i_s32f_convert_32f_a.h @@ -17,7 +17,7 @@ static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + float* outputVectorPtr = outputVector; const float iScalar = 1.0 / scalar; __m128 invScalar = _mm_set_ps1(iScalar); diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h index d8afd218c..b3a8ab201 100644 --- a/volk/include/volk/volk_32i_s32f_convert_32f_u.h +++ b/volk/include/volk/volk_32i_s32f_convert_32f_u.h @@ -18,7 +18,7 @@ static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + float* outputVectorPtr = outputVector; const float iScalar = 1.0 / scalar; __m128 invScalar = _mm_set_ps1(iScalar); diff --git a/volk/include/volk/volk_32i_x2_and_32i_a.h b/volk/include/volk/volk_32i_x2_and_32i_a.h index dcd63d98e..e5330847b 100644 --- a/volk/include/volk/volk_32i_x2_and_32i_a.h +++ b/volk/include/volk/volk_32i_x2_and_32i_a.h @@ -23,12 +23,12 @@ static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aV __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_and_ps(aVal, bVal); - + + cVal = _mm_and_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32i_x2_or_32i_a.h b/volk/include/volk/volk_32i_x2_or_32i_a.h index 243e8178c..24045894c 100644 --- a/volk/include/volk/volk_32i_x2_or_32i_a.h +++ b/volk/include/volk/volk_32i_x2_or_32i_a.h @@ -23,12 +23,12 @@ static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVe __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_or_ps(aVal, bVal); - + + cVal = _mm_or_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32u_byteswap_a.h b/volk/include/volk/volk_32u_byteswap_a.h index b88848096..71ae027d3 100644 --- a/volk/include/volk/volk_32u_byteswap_a.h +++ b/volk/include/volk/volk_32u_byteswap_a.h @@ -39,9 +39,9 @@ static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int n _mm_store_si128((__m128i*)inputPtr, output); inputPtr += 4; } - + // Byteswap any remaining points: - number = quarterPoints*4; + number = quarterPoints*4; for(; number < num_points; number++){ uint32_t outputVal = *inputPtr; outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); @@ -64,7 +64,7 @@ static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned in for(point = 0; point < num_points; point++){ uint32_t output = *inputPtr; output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); - + *inputPtr = output; inputPtr++; } diff --git a/volk/include/volk/volk_64f_convert_32f_a.h b/volk/include/volk/volk_64f_convert_32f_a.h index 2126e4f95..11d51702b 100644 --- a/volk/include/volk/volk_64f_convert_32f_a.h +++ b/volk/include/volk/volk_64f_convert_32f_a.h @@ -16,7 +16,7 @@ static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const double* inputVectorPtr = (const double*)inputVector; float* outputVectorPtr = outputVector; __m128 ret, ret2; @@ -25,7 +25,7 @@ static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double for(;number < quarterPoints; number++){ inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - + ret = _mm_cvtpd_ps(inputVal1); ret2 = _mm_cvtpd_ps(inputVal2); @@ -35,7 +35,7 @@ static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double outputVectorPtr += 4; } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ outputVector[number] = (float)(inputVector[number]); } diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h index 5c323230a..31dc5b5fe 100644 --- a/volk/include/volk/volk_64f_convert_32f_u.h +++ b/volk/include/volk/volk_64f_convert_32f_u.h @@ -16,7 +16,7 @@ static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const double* inputVectorPtr = (const double*)inputVector; float* outputVectorPtr = outputVector; __m128 ret, ret2; @@ -25,7 +25,7 @@ static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double for(;number < quarterPoints; number++){ inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - + ret = _mm_cvtpd_ps(inputVal1); ret2 = _mm_cvtpd_ps(inputVal2); @@ -35,7 +35,7 @@ static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double outputVectorPtr += 4; } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ outputVector[number] = (float)(inputVector[number]); } diff --git a/volk/include/volk/volk_64f_x2_max_64f_a.h b/volk/include/volk/volk_64f_x2_max_64f_a.h index 61a704c52..33aae6d10 100644 --- a/volk/include/volk/volk_64f_x2_max_64f_a.h +++ b/volk/include/volk/volk_64f_x2_max_64f_a.h @@ -23,12 +23,12 @@ static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVe __m128d aVal, bVal, cVal; for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); + + aVal = _mm_load_pd(aPtr); bVal = _mm_load_pd(bPtr); - - cVal = _mm_max_pd(aVal, bVal); - + + cVal = _mm_max_pd(aVal, bVal); + _mm_store_pd(cPtr,cVal); // Store the results back into the C container aPtr += 2; diff --git a/volk/include/volk/volk_64f_x2_min_64f_a.h b/volk/include/volk/volk_64f_x2_min_64f_a.h index 148b72c59..25d8b4c98 100644 --- a/volk/include/volk/volk_64f_x2_min_64f_a.h +++ b/volk/include/volk/volk_64f_x2_min_64f_a.h @@ -23,12 +23,12 @@ static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVe __m128d aVal, bVal, cVal; for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); + + aVal = _mm_load_pd(aPtr); bVal = _mm_load_pd(bPtr); - - cVal = _mm_min_pd(aVal, bVal); - + + cVal = _mm_min_pd(aVal, bVal); + _mm_store_pd(cPtr,cVal); // Store the results back into the C container aPtr += 2; diff --git a/volk/include/volk/volk_64u_byteswap_a.h b/volk/include/volk/volk_64u_byteswap_a.h index d4fc74a6e..3d1d87623 100644 --- a/volk/include/volk/volk_64u_byteswap_a.h +++ b/volk/include/volk/volk_64u_byteswap_a.h @@ -34,7 +34,7 @@ static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int n output = _mm_or_si128(output, byte2); byte3 = _mm_and_si128(byte3, byte3mask); output = _mm_or_si128(output, byte3); - + // Reorder the two words output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); @@ -42,17 +42,17 @@ static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int n _mm_store_si128((__m128i*)inputPtr, output); inputPtr += 4; } - + // Byteswap any remaining points: - number = halfPoints*2; + number = halfPoints*2; for(; number < num_points; number++){ uint32_t output1 = *inputPtr; uint32_t output2 = inputPtr[1]; - + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - + *inputPtr++ = output2; *inputPtr++ = output1; } @@ -71,11 +71,11 @@ static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned in for(point = 0; point < num_points; point++){ uint32_t output1 = *inputPtr; uint32_t output2 = inputPtr[1]; - + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - + *inputPtr++ = output2; *inputPtr++ = output1; } diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h index 4683f1e38..7d7359ccf 100644 --- a/volk/include/volk/volk_64u_popcnt_a.h +++ b/volk/include/volk/volk_64u_popcnt_a.h @@ -11,7 +11,7 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) { //const uint32_t* valueVector = (const uint32_t*)&value; - + // This is faster than a lookup table //uint32_t retVal = valueVector[0]; uint32_t retVal = (uint32_t)(value && 0x00000000FFFFFFFF); diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a.h b/volk/include/volk/volk_8i_s32f_convert_32f_a.h index 7f2623ac6..02a7f356e 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_a.h +++ b/volk/include/volk/volk_8i_s32f_convert_32f_a.h @@ -17,7 +17,7 @@ static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; - + float* outputVectorPtr = outputVector; const float iScalar = 1.0 / scalar; __m128 invScalar = _mm_set_ps1(iScalar); diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h index 3cd6bb67c..8bb2c0d1a 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_u.h +++ b/volk/include/volk/volk_8i_s32f_convert_32f_u.h @@ -18,7 +18,7 @@ static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; - + float* outputVectorPtr = outputVector; const float iScalar = 1.0 / scalar; __m128 invScalar = _mm_set_ps1( iScalar ); diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h index b723c6f8b..d82da59fb 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h @@ -20,7 +20,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, fl float* qBufferPtr = qBuffer; unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + const unsigned int eighthPoints = num_points / 8; __m128 iFloatValue, qFloatValue; const float iScalar= 1.0 / scalar; @@ -71,7 +71,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, fl *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; } - + } #endif /* LV_HAVE_SSE4_1 */ @@ -90,7 +90,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float float* qBufferPtr = qBuffer; unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 cplxValue1, cplxValue2, iValue, qValue; __m128 invScalar = _mm_set_ps1(1.0/scalar); @@ -103,7 +103,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float floatBuffer[1] = (float)(complexVectorPtr[1]); floatBuffer[2] = (float)(complexVectorPtr[2]); floatBuffer[3] = (float)(complexVectorPtr[3]); - + floatBuffer[4] = (float)(complexVectorPtr[4]); floatBuffer[5] = (float)(complexVectorPtr[5]); floatBuffer[6] = (float)(complexVectorPtr[6]); diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h index 74073f5a6..b2c15d3a3 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h @@ -18,7 +18,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, float* iBufferPtr = iBuffer; unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + const unsigned int eighthPoints = num_points / 8; __m128 iFloatValue; const float iScalar= 1.0 / scalar; @@ -57,7 +57,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; complexVectorPtr++; } - + } #endif /* LV_HAVE_SSE4_1 */ @@ -75,7 +75,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con float* iBufferPtr = iBuffer; unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 iValue; const float iScalar= 1.0 / scalar; @@ -88,7 +88,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; iValue = _mm_load_ps(floatBuffer); @@ -104,7 +104,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; complexVectorPtr++; } - + } #endif /* LV_HAVE_SSE */ diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h index 0c280eb6e..f85fdb999 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h @@ -23,15 +23,15 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect const lv_8sc_t* a = aVector; const lv_8sc_t* b = bVector; __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - + for(;number < quarterPoints; number++){ // Convert into 8 bit values into 16 bit values x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); - + // Calculate the ar*cr - ai*(-ci) portions realz = _mm_madd_epi16(x,y); - + // Calculate the complex conjugate of the cr + ci j values y = _mm_sign_epi16(y, conjugateSign); @@ -47,7 +47,7 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect b += 4; c += 4; } - + number = quarterPoints * 4; int16_t* c16Ptr = (int16_t*)&cVector[number]; int8_t* a8Ptr = (int8_t*)&aVector[number]; diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h index a2c2b04f6..4b16171ce 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h @@ -80,7 +80,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* float bImag = (float)*b8Ptr++; lv_32fc_t bVal = lv_cmake( bReal, -bImag ); lv_32fc_t temp = aVal * bVal; - + *cFloatPtr++ = lv_creal(temp) / scalar; *cFloatPtr++ = lv_cimag(temp) / scalar; } @@ -109,7 +109,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_generic(lv_32fc_t* float bImag = (float)*b8Ptr++; lv_32fc_t bVal = lv_cmake( bReal, -bImag ); lv_32fc_t temp = aVal * bVal; - + *cPtr++ = (lv_creal(temp) * invScalar); *cPtr++ = (lv_cimag(temp) * invScalar); } -- cgit From 231b177ae5c2ef7726cfc18b709ba850c021cf1b Mon Sep 17 00:00:00 2001 From: Moritz Fischer Date: Mon, 7 May 2012 21:25:28 -0400 Subject: volk: fixed popcnt. --- volk/include/volk/volk_64u_popcnt_a.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h index 7d7359ccf..5e68ed208 100644 --- a/volk/include/volk/volk_64u_popcnt_a.h +++ b/volk/include/volk/volk_64u_popcnt_a.h @@ -14,7 +14,7 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value // This is faster than a lookup table //uint32_t retVal = valueVector[0]; - uint32_t retVal = (uint32_t)(value && 0x00000000FFFFFFFF); + uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFF); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); @@ -24,7 +24,7 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value uint64_t retVal64 = retVal; //retVal = valueVector[1]; - retVal = (uint32_t)((value && 0xFFFFFFFF00000000) >> 31); + retVal = (uint32_t)((value & 0xFFFFFFFF00000000) >> 31); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; -- cgit From ee0ed0b725b19e223f8f7f3ff27d4374c5fcb35a Mon Sep 17 00:00:00 2001 From: Nick McCarthy Date: Fri, 11 May 2012 16:29:08 -0700 Subject: volk: add SIMD implementation for fixed phase rotation --- .../volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h | 74 ++++++ .../volk/volk_32fc_s32fc_x2_rotator_32fc_a.h | 261 +++++++++++++++++++++ 2 files changed, 335 insertions(+) create mode 100644 volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h create mode 100644 volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h new file mode 100644 index 000000000..eee9f0064 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h @@ -0,0 +1,74 @@ +#ifndef INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H +#define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H + + +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + +/*! + \brief rotate input vector at fixed rate per sample from initial phase offset + \param outVector The vector where the results will be stored + \param inVector Vector to be rotated + \param phase_inc rotational velocity + \param phase initial phase offset + \param num_points The number of values in inVector to be rotated and stored into cVector +*/ + + +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ + lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)}; + volk_32fc_s32fc_x2_rotator_32fc_a_generic(outVector, inVector, phase_inc, phase, num_points); + +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ + lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; + volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc, phase, num_points); + +} + + + + +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_AVX +#include + +/*! + \brief rotate input vector at fixed rate per sample from initial phase offset + \param outVector The vector where the results will be stored + \param inVector Vector to be rotated + \param phase_inc rotational velocity + \param phase initial phase offset + \param num_points The number of values in inVector to be rotated and stored into cVector +*/ + + + + +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ + lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; + volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc, phase, num_points); + +} + +#endif /* LV_HAVE_AVX */ + + + + + + + + +#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h new file mode 100644 index 000000000..f8076adc3 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h @@ -0,0 +1,261 @@ +#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H +#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H + + +#include +#include +#define ROTATOR_RELOAD 512 + + +#ifdef LV_HAVE_GENERIC + +/*! + \brief rotate input vector at fixed rate per sample from initial phase offset + \param outVector The vector where the results will be stored + \param inVector Vector to be rotated + \param phase_inc rotational velocity + \param phase initial phase offset + \param num_points The number of values in inVector to be rotated and stored into cVector +*/ + + +static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ + *phase = lv_cmake(1.0, 0.0); + int i = 0; + int j = 0; + for(i = 0; i < (int)(num_points/ROTATOR_RELOAD); ++i) { + for(j = 0; j < ROTATOR_RELOAD; ++j) { + *outVector++ = *inVector++ * (*phase); + (*phase) *= phase_inc; + } + (*phase) /= abs((*phase)); + } + for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) { + *outVector++ = *inVector++ * (*phase); + (*phase) *= phase_inc; + } + +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ + *phase = lv_cmake(1.0, 0.0); + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = 1; + lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)}; + + int i, j = 0; + + for(i = 0; i < 2; ++i) { + phase_Ptr[i] *= incr; + incr *= (phase_inc); + } + + /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0])); + printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1])); + printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2])); + printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3])); + printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ + __m128 aVal, phase_Val, inc_Val, cVal, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; + + phase_Val = _mm_loadu_ps((float*)phase_Ptr); + inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); + + const unsigned int halfPoints = num_points / 2; + + + for(i = 0; i < (int)(halfPoints/ROTATOR_RELOAD); i++) { + for(j = 0; j < ROTATOR_RELOAD; ++j) { + + aVal = _mm_load_ps((float*)aPtr); + + yl = _mm_moveldup_ps(phase_Val); + yh = _mm_movehdup_ps(phase_Val); + ylp = _mm_moveldup_ps(inc_Val); + yhp = _mm_movehdup_ps(inc_Val); + + tmp1 = _mm_mul_ps(aVal, yl); + tmp1p = _mm_mul_ps(phase_Val, ylp); + + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm_mul_ps(aVal, yh); + tmp2p = _mm_mul_ps(phase_Val, yhp); + + z = _mm_addsub_ps(tmp1, tmp2); + phase_Val = _mm_addsub_ps(tmp1p, tmp2p); + + _mm_store_ps((float*)cPtr, z); + + aPtr += 2; + cPtr += 2; + } + tmp1 = _mm_mul_ps(phase_Val, phase_Val); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + phase_Val = _mm_div_ps(phase_Val, tmp1); + } + for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) { + aVal = _mm_load_ps((float*)aPtr); + + yl = _mm_moveldup_ps(phase_Val); + yh = _mm_movehdup_ps(phase_Val); + ylp = _mm_moveldup_ps(inc_Val); + yhp = _mm_movehdup_ps(inc_Val); + + tmp1 = _mm_mul_ps(aVal, yl); + + tmp1p = _mm_mul_ps(phase_Val, ylp); + + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm_mul_ps(aVal, yh); + tmp2p = _mm_mul_ps(phase_Val, yhp); + + z = _mm_addsub_ps(tmp1, tmp2); + phase_Val = _mm_addsub_ps(tmp1p, tmp2p); + + _mm_store_ps((float*)cPtr, z); + + aPtr += 2; + cPtr += 2; + } + + _mm_storeu_ps((float*)phase_Ptr, phase_Val); + for(i = 0; i < num_points%2; ++i) { + *cPtr++ = *aPtr++ * phase_Ptr[0]; + phase_Ptr[0] *= (phase_inc); + } + + (*phase) = phase_Ptr[0]; + +} + +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_AVX +#include + +/*! + \brief rotate input vector at fixed rate per sample from initial phase offset + \param outVector The vector where the results will be stored + \param inVector Vector to be rotated + \param phase_inc rotational velocity + \param phase initial phase offset + \param num_points The number of values in inVector to be rotated and stored into cVector +*/ + + + + +static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ + *phase = lv_cmake(1.0, 0.0); + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = 1; + lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; + + int i, j = 0; + + for(i = 0; i < 4; ++i) { + phase_Ptr[i] *= incr; + incr *= (phase_inc); + } + + /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0])); + printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1])); + printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2])); + printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3])); + printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ + __m256 aVal, phase_Val, inc_Val, cVal, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros; + + phase_Val = _mm256_loadu_ps((float*)phase_Ptr); + inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); + zeros = _mm256_set1_ps(0.0); + negated = _mm256_set1_ps(-1.0); + const unsigned int fourthPoints = num_points / 4; + + + for(i = 0; i < (int)(fourthPoints/ROTATOR_RELOAD); i++) { + for(j = 0; j < ROTATOR_RELOAD; ++j) { + + aVal = _mm256_load_ps((float*)aPtr); + + yl = _mm256_moveldup_ps(phase_Val); + yh = _mm256_movehdup_ps(phase_Val); + ylp = _mm256_moveldup_ps(inc_Val); + yhp = _mm256_movehdup_ps(inc_Val); + + tmp1 = _mm256_mul_ps(aVal, yl); + tmp1p = _mm256_mul_ps(phase_Val, ylp); + + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm256_mul_ps(aVal, yh); + tmp2p = _mm256_mul_ps(phase_Val, yhp); + + z = _mm256_addsub_ps(tmp1, tmp2); + phase_Val = _mm256_addsub_ps(tmp1p, tmp2p); + + _mm256_store_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + tmp1 = _mm256_mul_ps(phase_Val, phase_Val); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + phase_Val = _mm256_div_ps(phase_Val, tmp1); + } + for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { + aVal = _mm256_load_ps((float*)aPtr); + + yl = _mm256_moveldup_ps(phase_Val); + yh = _mm256_movehdup_ps(phase_Val); + ylp = _mm256_moveldup_ps(inc_Val); + yhp = _mm256_movehdup_ps(inc_Val); + + tmp1 = _mm256_mul_ps(aVal, yl); + + tmp1p = _mm256_mul_ps(phase_Val, ylp); + + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm256_mul_ps(aVal, yh); + tmp2p = _mm256_mul_ps(phase_Val, yhp); + + z = _mm256_addsub_ps(tmp1, tmp2); + phase_Val = _mm256_addsub_ps(tmp1p, tmp2p); + + _mm256_store_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + + _mm256_storeu_ps((float*)phase_Ptr, phase_Val); + for(i = 0; i < num_points%4; ++i) { + *cPtr++ = *aPtr++ * phase_Ptr[0]; + phase_Ptr[0] *= (phase_inc); + } + + (*phase) = phase_Ptr[0]; + +} + +#endif /* LV_HAVE_AVX */ + + + + + + + + +#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */ -- cgit From ff3d467ba8bf8a3e21dd3dfd448262f6a77b3e4d Mon Sep 17 00:00:00 2001 From: Johnathan Corgan Date: Sat, 12 May 2012 08:51:34 -0700 Subject: volk: fix some signedness and unused variable warnings --- volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h index f8076adc3..80c55e75f 100644 --- a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h @@ -4,6 +4,7 @@ #include #include +#include #define ROTATOR_RELOAD 512 @@ -21,9 +22,9 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ *phase = lv_cmake(1.0, 0.0); - int i = 0; + unsigned int i = 0; int j = 0; - for(i = 0; i < (int)(num_points/ROTATOR_RELOAD); ++i) { + for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) { for(j = 0; j < ROTATOR_RELOAD; ++j) { *outVector++ = *inVector++ * (*phase); (*phase) *= phase_inc; @@ -49,7 +50,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector lv_32fc_t incr = 1; lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)}; - int i, j = 0; + unsigned int i, j = 0; for(i = 0; i < 2; ++i) { phase_Ptr[i] *= incr; @@ -61,7 +62,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2])); printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3])); printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ - __m128 aVal, phase_Val, inc_Val, cVal, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; + __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; phase_Val = _mm_loadu_ps((float*)phase_Ptr); inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); @@ -69,7 +70,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector const unsigned int halfPoints = num_points / 2; - for(i = 0; i < (int)(halfPoints/ROTATOR_RELOAD); i++) { + for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) { for(j = 0; j < ROTATOR_RELOAD; ++j) { aVal = _mm_load_ps((float*)aPtr); @@ -161,7 +162,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c lv_32fc_t incr = 1; lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; - int i, j = 0; + unsigned int i, j = 0; for(i = 0; i < 4; ++i) { phase_Ptr[i] *= incr; @@ -173,7 +174,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2])); printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3])); printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ - __m256 aVal, phase_Val, inc_Val, cVal, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros; + __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros; phase_Val = _mm256_loadu_ps((float*)phase_Ptr); inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); @@ -182,7 +183,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c const unsigned int fourthPoints = num_points / 4; - for(i = 0; i < (int)(fourthPoints/ROTATOR_RELOAD); i++) { + for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { for(j = 0; j < ROTATOR_RELOAD; ++j) { aVal = _mm256_load_ps((float*)aPtr); -- cgit From 9e2e896e9d4dbc4627702cde82a48e9ee5136f26 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 13 Jun 2012 14:53:41 -0400 Subject: filter: process 4 vectors each time in volk dot_prod to speed up fir filters. This makes the volk version of the SSE FIR filter the same speed as using the hand-crafted float_dotprod from before. --- volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 48 +++++++++++++++++++------- 1 file changed, 36 insertions(+), 12 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h index 448b2fdc0..8753ff615 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -37,26 +37,47 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - __m128 dotProdVal = _mm_setzero_ps(); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 4*4; + bPtr += 4*4; } + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; @@ -66,6 +87,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* number = quarterPoints * 4; for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); } *result = dotProduct; -- cgit From 7f9f0fc96a3fbfe297b0a5cb18d922bb74fdc34d Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 13 Jun 2012 17:49:44 -0400 Subject: volk: dot_produce for floats does 16 at a time. This was done to make this have the same performance as float_dotprod from before. This makes all flavors of the 32f dotprod work the same way. Because it's expecting the input to have 4x more samples than specified, it's making qa for these fail. --- volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 84 +++++++++----- volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 151 +++++++++++++++++-------- 2 files changed, 160 insertions(+), 75 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h index 8753ff615..0543227b0 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -8,14 +8,17 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) { float dotProduct = 0; const float* aPtr = input; const float* bPtr= taps; unsigned int number = 0; - for(number = 0; number < num_points; number++){ + for(number = 0; number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -28,10 +31,10 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa #ifdef LV_HAVE_SSE -static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_4x_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_4x_points / 4; float dotProduct = 0; const float* aPtr = input; @@ -67,8 +70,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - aPtr += 4*4; - bPtr += 4*4; + aPtr += 16; + bPtr += 16; } dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); @@ -84,8 +87,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = quarterPoints * 4; - for(;number < num_points; number++){ + number = quarterPoints*4; + for(;number < num_4x_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); @@ -102,41 +105,65 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* #include -static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_4x_points / 4; float dotProduct = 0; const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - __m128 dotProdVal = _mm_setzero_ps(); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); - cVal = _mm_mul_ps(aVal, bVal); + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); + dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); + dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); + dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); - aPtr += 4; - bPtr += 4; + aPtr += 16; + bPtr += 16; } - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; - number = quarterPoints * 4; - for(;number < num_points; number++){ + number = quarterPoints*4; + for(;number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -149,9 +176,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * #include -static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) { unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + const unsigned int sixteenthPoints = num_4x_points / 4; float dotProduct = 0; const float* aPtr = input; @@ -196,8 +223,11 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = sixteenthPoints * 16; - for(;number < num_points; number++){ + number = sixteenthPoints * 4; + for(;number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index 3b7284b57..dfafe2239 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -1,20 +1,24 @@ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H +#include #include #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) { float dotProduct = 0; const float* aPtr = input; const float* bPtr= taps; unsigned int number = 0; - for(number = 0; number < num_points; number++){ + for(number = 0; number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -27,43 +31,67 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa #ifdef LV_HAVE_SSE -static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_4x_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_4x_points / 4; float dotProduct = 0; const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - __m128 dotProdVal = _mm_setzero_ps(); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; } + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = quarterPoints * 4; - for(;number < num_points; number++){ + number = quarterPoints*4; + for(;number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -77,41 +105,65 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* #include -static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_4x_points / 4; float dotProduct = 0; const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - __m128 dotProdVal = _mm_setzero_ps(); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); - - aPtr += 4; - bPtr += 4; + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); + dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); + dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); + dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); + + aPtr += 16; + bPtr += 16; } - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; - number = quarterPoints * 4; - for(;number < num_points; number++){ + number = quarterPoints*4; + for(;number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -124,9 +176,9 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * #include -static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) { unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + const unsigned int sixteenthPoints = num_4x_points / 4; float dotProduct = 0; const float* aPtr = input; @@ -141,15 +193,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float for(;number < sixteenthPoints; number++){ - aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + aVal4 = _mm_load_ps(aPtr); aPtr += 4; - bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal1 = _mm_load_ps(bPtr); bPtr += 4; + bVal2 = _mm_load_ps(bPtr); bPtr += 4; + bVal3 = _mm_load_ps(bPtr); bPtr += 4; + bVal4 = _mm_load_ps(bPtr); bPtr += 4; cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); @@ -171,8 +223,11 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = sixteenthPoints * 16; - for(;number < num_points; number++){ + number = sixteenthPoints * 4; + for(;number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } -- cgit From c24cabd47e2aaadf279c19aec73b311dd7ddce6c Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 14 Jun 2012 12:25:04 -0400 Subject: volk: fixes for 32f dot_prod Accepts num_points like everything else and handles splitting up numbers itself, not expected to be done externally. Adds AVX version, both aligned and unaligned. --- volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 105 ++++++++++++++++++------- volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 102 ++++++++++++++++++------ 2 files changed, 154 insertions(+), 53 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h index 0543227b0..c26fd5e7c 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -8,17 +8,14 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) { +static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { float dotProduct = 0; const float* aPtr = input; const float* bPtr= taps; unsigned int number = 0; - for(number = 0; number < num_4x_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); + for(number = 0; number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -31,10 +28,10 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa #ifdef LV_HAVE_SSE -static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_4x_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_4x_points / 4; + const unsigned int sixteenthPoints = num_points / 16; float dotProduct = 0; const float* aPtr = input; @@ -49,7 +46,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* __m128 dotProdVal2 = _mm_setzero_ps(); __m128 dotProdVal3 = _mm_setzero_ps(); - for(;number < quarterPoints; number++){ + for(;number < sixteenthPoints; number++){ a0Val = _mm_load_ps(aPtr); a1Val = _mm_load_ps(aPtr+4); @@ -87,11 +84,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = quarterPoints*4; - for(;number < num_4x_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); + number = sixteenthPoints*16; + for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -105,9 +99,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* #include -static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_4x_points / 4; + const unsigned int sixteenthPoints = num_points / 16; float dotProduct = 0; const float* aPtr = input; @@ -122,7 +116,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * __m128 dotProdVal2 = _mm_setzero_ps(); __m128 dotProdVal3 = _mm_setzero_ps(); - for(;number < quarterPoints; number++){ + for(;number < sixteenthPoints; number++){ a0Val = _mm_load_ps(aPtr); a1Val = _mm_load_ps(aPtr+4); @@ -159,11 +153,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = quarterPoints*4; - for(;number < num_4x_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); + number = sixteenthPoints*16; + for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -176,9 +167,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * #include -static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { unsigned int number = 0; - const unsigned int sixteenthPoints = num_4x_points / 4; + const unsigned int sixteenthPoints = num_points / 16; float dotProduct = 0; const float* aPtr = input; @@ -223,11 +214,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = sixteenthPoints * 4; - for(;number < num_4x_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); + number = sixteenthPoints * 16; + for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -236,4 +224,65 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float #endif /*LV_HAVE_SSE4_1*/ +#ifdef LV_HAVE_AVX + +static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val; + __m256 b0Val, b1Val; + __m256 c0Val, c1Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr+8); + b0Val = _mm256_load_ps(bPtr); + b1Val = _mm256_load_ps(bPtr+8); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_AVX*/ + #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index dfafe2239..f9ae15094 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -8,17 +8,14 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) { +static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) { float dotProduct = 0; const float* aPtr = input; const float* bPtr= taps; unsigned int number = 0; - for(number = 0; number < num_4x_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); + for(number = 0; number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -31,10 +28,10 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa #ifdef LV_HAVE_SSE -static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_4x_points) { +static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_4x_points / 4; + const unsigned int sixteenthPoints = num_points / 16; float dotProduct = 0; const float* aPtr = input; @@ -49,7 +46,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* __m128 dotProdVal2 = _mm_setzero_ps(); __m128 dotProdVal3 = _mm_setzero_ps(); - for(;number < quarterPoints; number++){ + for(;number < sixteenthPoints; number++){ a0Val = _mm_load_ps(aPtr); a1Val = _mm_load_ps(aPtr+4); @@ -87,8 +84,8 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = quarterPoints*4; - for(;number < num_4x_points; number++){ + number = sixteenthPoints*16; + for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); @@ -105,9 +102,9 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* #include -static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) { +static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_4x_points / 4; + const unsigned int sixteenthPoints = num_points / 16; float dotProduct = 0; const float* aPtr = input; @@ -122,7 +119,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * __m128 dotProdVal2 = _mm_setzero_ps(); __m128 dotProdVal3 = _mm_setzero_ps(); - for(;number < quarterPoints; number++){ + for(;number < sixteenthPoints; number++){ a0Val = _mm_load_ps(aPtr); a1Val = _mm_load_ps(aPtr+4); @@ -159,11 +156,8 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = quarterPoints*4; - for(;number < num_4x_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); + number = sixteenthPoints*16; + for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -176,9 +170,9 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * #include -static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) { +static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { unsigned int number = 0; - const unsigned int sixteenthPoints = num_4x_points / 4; + const unsigned int sixteenthPoints = num_points / 16; float dotProduct = 0; const float* aPtr = input; @@ -223,11 +217,8 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = sixteenthPoints * 4; - for(;number < num_4x_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); + number = sixteenthPoints * 16; + for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -236,4 +227,65 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float #endif /*LV_HAVE_SSE4_1*/ +#ifdef LV_HAVE_AVX + +static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val; + __m256 b0Val, b1Val; + __m256 c0Val, c1Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr+8); + b0Val = _mm256_loadu_ps(bPtr); + b1Val = _mm256_loadu_ps(bPtr+8); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_AVX*/ + #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/ -- cgit From 1a5a1d4e2e0374f73cef5fbc9a01bf82d8b7d892 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 14 Jun 2012 16:12:00 -0400 Subject: filter: added a ccf Volk dot product to use with ccf filters and used it in fir_filter_ccf. Produces improved results to previous version. --- volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h | 111 ++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h new file mode 100644 index 000000000..109b787e8 --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h @@ -0,0 +1,111 @@ +#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H +#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H + +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) { + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr= taps; + unsigned int number = 0; + + *realpt = 0; + *imagpt = 0; + + for(number = 0; number < num_points; number++){ + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 x0Val, x1Val, x2Val, x3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + + x0Val = _mm_load_ps(bPtr); + x1Val = _mm_load_ps(bPtr); + x2Val = _mm_load_ps(bPtr+4); + x3Val = _mm_load_ps(bPtr+4); + b0Val = _mm_unpacklo_ps(x0Val, x1Val); + b1Val = _mm_unpackhi_ps(x0Val, x1Val); + b2Val = _mm_unpacklo_ps(x2Val, x3Val); + b3Val = _mm_unpackhi_ps(x2Val, x3Val); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 8; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + + number = sixteenthPoints*8; + for(;number < num_points; number++){ + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_SSE*/ + + +#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/ -- cgit From 5585c71229cfa7886e0bd090828cd1f5104f6b27 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 15 Jun 2012 08:43:20 -0400 Subject: filter: adding ssc and fsf versions of filter with associated new Volk kernels. These routines work and pass QA. They could use some performance work. the FSF is just slightly slower than before; the SCC version is more noticably slower. Both could benefit, probably, by using SSE2 intrinsics to handle the shorts. --- volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h | 122 ++++++++++++++++++++++ volk/include/volk/volk_32f_x2_dot_prod_16i_a.h | 98 +++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h create mode 100644 volk/include/volk/volk_32f_x2_dot_prod_16i_a.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h new file mode 100644 index 000000000..940aa5de7 --- /dev/null +++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H +#define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H + +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { + + static const int N_UNROLL = 4; + + lv_32fc_t acc0 = 0; + lv_32fc_t acc1 = 0; + lv_32fc_t acc2 = 0; + lv_32fc_t acc3 = 0; + + unsigned i = 0; + unsigned n = (num_points / N_UNROLL) * N_UNROLL; + + for(i = 0; i < n; i += N_UNROLL) { + acc0 += taps[i + 0] * (float)input[i + 0]; + acc1 += taps[i + 1] * (float)input[i + 1]; + acc2 += taps[i + 2] * (float)input[i + 2]; + acc3 += taps[i + 3] * (float)input[i + 3]; + } + + for(; i < num_points; i++) { + acc0 += taps[i] * (float)input[i]; + } + + *result = acc0 + acc1 + acc2 + acc3; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const short* aPtr = input; + const float* bPtr = (float*)taps; + + __m64 m0, m1; + __m128 f0, f1, f2, f3; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0)); + m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4)); + f0 = _mm_cvtpi16_ps(m0); + f1 = _mm_cvtpi16_ps(m0); + f2 = _mm_cvtpi16_ps(m1); + f3 = _mm_cvtpi16_ps(m1); + + a0Val = _mm_unpacklo_ps(f0, f1); + a1Val = _mm_unpackhi_ps(f0, f1); + a2Val = _mm_unpacklo_ps(f2, f3); + a3Val = _mm_unpackhi_ps(f2, f3); + + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 8; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + + number = sixteenthPoints*8; + for(;number < num_points; number++){ + *realpt += ((*aPtr) * (*bPtr++)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_SSE*/ + + +#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h new file mode 100644 index 000000000..961c2418c --- /dev/null +++ b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h @@ -0,0 +1,98 @@ +#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H +#define INCLUDED_volk_32f_x2_dot_prod_16i_a_H + +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (int16_t)dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/ -- cgit From a0ffd48d73eda63d5fa3a9ded1dc0f34b116da9d Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Wed, 20 Jun 2012 16:46:29 -0400 Subject: volk: added missing avx header include --- volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 2 ++ volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h index c26fd5e7c..067c33ad8 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -226,6 +226,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float #ifdef LV_HAVE_AVX +#include + static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) { unsigned int number = 0; diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index f9ae15094..ab33a2587 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -229,6 +229,8 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float #ifdef LV_HAVE_AVX +#include + static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) { unsigned int number = 0; -- cgit From 81589710a52c0e1b087f28ac54e83b9cbfa477d1 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 22 Jun 2012 18:58:33 -0400 Subject: volk: fixing some volk kernels. This should fix some problems with gr-filter QA tests. Also removes some warnings. --- .../volk/volk_32fc_s32fc_x2_rotator_32fc_a.h | 4 +-- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h | 33 ++-------------------- 2 files changed, 4 insertions(+), 33 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h index 80c55e75f..05732b1ea 100644 --- a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h @@ -174,12 +174,10 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2])); printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3])); printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ - __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros; + __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; phase_Val = _mm256_loadu_ps((float*)phase_Ptr); inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); - zeros = _mm256_set1_ps(0.0); - negated = _mm256_set1_ps(-1.0); const unsigned int fourthPoints = num_points / 4; diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h index cb2ac4c67..166a883a7 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h @@ -18,40 +18,26 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const unsigned int n_2_ccomplex_blocks = num_bytes >> 4; unsigned int isodd = (num_bytes >> 3) &1; - - float sum0[2] = {0,0}; float sum1[2] = {0,0}; unsigned int i = 0; - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; sum0[1] += in[0] * tp[1] + in[1] * tp[0]; sum1[0] += in[2] * tp[2] - in[3] * tp[3]; sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - in += 4; tp += 4; - } - res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - - for(i = 0; i < isodd; ++i) { - - *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; - } - } #endif /*LV_HAVE_GENERIC*/ @@ -177,14 +163,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const ); - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - + if(((num_bytes >> 3) & 1)) { *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - } return; @@ -363,7 +343,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - if((num_bytes >> 2) != 0) { + if(((num_bytes >> 3) & 1) != 0) { dotProduct += (*a) * (*b); } @@ -377,9 +357,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv #include static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - volk_32fc_x2_dot_prod_32fc_a_sse3(result, input, taps, num_bytes); - // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now - /* + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; float *p_input, *p_taps; __m64 *p_result; @@ -442,12 +420,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const } - - - real1 = _mm_xor_ps(real1, (__m128)neg); - im0 = _mm_add_ps(im0, im1); real0 = _mm_add_ps(real0, real1); @@ -459,7 +433,6 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const *result += input[i] * taps[i]; } - */ } #endif /*LV_HAVE_SSE4_1*/ -- cgit From feaa399ba51e8959e92cf9b73375db1c6b638b3f Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Mon, 25 Jun 2012 10:50:43 -0700 Subject: volk: replace (__m128) with volk cast for portability --- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h index 166a883a7..caef3e6f0 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h @@ -420,7 +420,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const } - real1 = _mm_xor_ps(real1, (__m128)neg); + real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); im0 = _mm_add_ps(im0, im1); real0 = _mm_add_ps(real0, real1); -- cgit From f2021f48926741abb62f9a54744191c81e5c950d Mon Sep 17 00:00:00 2001 From: Johnathan Corgan Date: Tue, 3 Jul 2012 17:31:07 -0700 Subject: volk: don't initialize phase in rotator --- volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h index 80c55e75f..08db99373 100644 --- a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h @@ -21,7 +21,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ - *phase = lv_cmake(1.0, 0.0); unsigned int i = 0; int j = 0; for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) { @@ -44,7 +43,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVecto #include static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ - *phase = lv_cmake(1.0, 0.0); lv_32fc_t* cPtr = outVector; const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; @@ -156,7 +154,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ - *phase = lv_cmake(1.0, 0.0); lv_32fc_t* cPtr = outVector; const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; -- cgit From 7c0f5550d06e9c5e2ab413e3e2b95a843a435135 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Wed, 4 Jul 2012 11:16:02 -0700 Subject: volk: fix volk_32f_x2_dot_prod_32f_u_sse tail case --- volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index ab33a2587..c43d229df 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -87,9 +87,6 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* number = sixteenthPoints*16; for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); } *result = dotProduct; -- cgit From 61dd97409abde2412595fa762c0c4161863604f4 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Wed, 4 Jul 2012 11:17:48 -0700 Subject: volk: use loadu for unaligned volk_32f_x2_dot_prod_32f_u_sse* --- volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 48 +++++++++++++------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index c43d229df..b24e8b1f7 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -48,14 +48,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* for(;number < sixteenthPoints; number++){ - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr+4); + a2Val = _mm_loadu_ps(aPtr+8); + a3Val = _mm_loadu_ps(aPtr+12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr+4); + b2Val = _mm_loadu_ps(bPtr+8); + b3Val = _mm_loadu_ps(bPtr+12); c0Val = _mm_mul_ps(a0Val, b0Val); c1Val = _mm_mul_ps(a1Val, b1Val); @@ -118,14 +118,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * for(;number < sixteenthPoints; number++){ - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr+4); + a2Val = _mm_loadu_ps(aPtr+8); + a3Val = _mm_loadu_ps(aPtr+12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr+4); + b2Val = _mm_loadu_ps(bPtr+8); + b3Val = _mm_loadu_ps(bPtr+12); c0Val = _mm_mul_ps(a0Val, b0Val); c1Val = _mm_mul_ps(a1Val, b1Val); @@ -184,15 +184,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float for(;number < sixteenthPoints; number++){ - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - aVal4 = _mm_load_ps(aPtr); aPtr += 4; + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; - bVal1 = _mm_load_ps(bPtr); bPtr += 4; - bVal2 = _mm_load_ps(bPtr); bPtr += 4; - bVal3 = _mm_load_ps(bPtr); bPtr += 4; - bVal4 = _mm_load_ps(bPtr); bPtr += 4; + bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); -- cgit From bf073d84627f728d2241bbcf48075bb3257b58ce Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sun, 15 Jul 2012 14:28:04 -0400 Subject: volk: adding unaligned byteswap kernel. --- volk/include/volk/volk_16u_byteswap_u.h | 76 +++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 volk/include/volk/volk_16u_byteswap_u.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h new file mode 100644 index 000000000..37bed7fce --- /dev/null +++ b/volk/include/volk/volk_16u_byteswap_u.h @@ -0,0 +1,76 @@ +#ifndef INCLUDED_volk_16u_byteswap_u_H +#define INCLUDED_volk_16u_byteswap_u_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the 16t values, increment inputPtr later since we're doing it in-place. + input = _mm_loadu_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + // Byteswap any remaining points: + number = eighthPoints*8; + for(; number < num_points; number++){ + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for(point = 0; point < num_points; point++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +extern void volk_16u_byteswap_u_orc_impl(uint16_t* intsToSwap, unsigned int num_points); +static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){ + volk_16u_byteswap_u_orc_impl(intsToSwap, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16u_byteswap_u_H */ -- cgit From 209e98f351f873450c1b2cc0f5bb7877f32e83e4 Mon Sep 17 00:00:00 2001 From: Johnathan Corgan Date: Mon, 16 Jul 2012 14:37:27 -0700 Subject: Comment out orc support for volk_16u_byteswap_u --- volk/include/volk/volk_16u_byteswap_u.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h index 37bed7fce..ffdd5ba33 100644 --- a/volk/include/volk/volk_16u_byteswap_u.h +++ b/volk/include/volk/volk_16u_byteswap_u.h @@ -60,7 +60,8 @@ static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned in } #endif /* LV_HAVE_GENERIC */ -#ifdef LV_HAVE_ORC +//#ifdef LV_HAVE_ORC +#if 0 /*! \brief Byteswaps (in-place) an unaligned vector of int16_t's. \param intsToSwap The vector of data to byte swap -- cgit From cf8b2a8ff57ff6d3fd4b78a897854cc9a6f49fe1 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Tue, 17 Jul 2012 10:29:35 -0700 Subject: volk: fix for win64 MSVC not having MMX support 1) For the machine defs that will pass on windows, generate a machine with mmx and without using mmx| 2) In the cmakelists, we overrule MMX arch on MSVC 64. Also overrule redundant machines for when MMX does pass. 3) Set LV_HAVE_SSE && LV_HAVE_MMX for volk_16i_32fc_dot_prod_32fc_a. Afterall, it is calling into MMX intrinsics as well. --- volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h index 940aa5de7..1f6554af8 100644 --- a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h @@ -37,7 +37,7 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, cons #endif /*LV_HAVE_GENERIC*/ -#ifdef LV_HAVE_SSE +#if LV_HAVE_SSE && LV_HAVE_MMX static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { @@ -116,7 +116,7 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const *result = *(lv_32fc_t*)(&res[0]); } -#endif /*LV_HAVE_SSE*/ +#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/ #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/ -- cgit From 3d868e178994f2e147bbee2c26c09fe283ab6992 Mon Sep 17 00:00:00 2001 From: Johnathan Corgan Date: Tue, 17 Jul 2012 15:36:43 -0700 Subject: volk: entirely remove commented out ORC clause for volk_16u_byteswap_u --- volk/include/volk/volk_16u_byteswap_u.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h index ffdd5ba33..8ef627a62 100644 --- a/volk/include/volk/volk_16u_byteswap_u.h +++ b/volk/include/volk/volk_16u_byteswap_u.h @@ -60,18 +60,4 @@ static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned in } #endif /* LV_HAVE_GENERIC */ -//#ifdef LV_HAVE_ORC -#if 0 -/*! - \brief Byteswaps (in-place) an unaligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -extern void volk_16u_byteswap_u_orc_impl(uint16_t* intsToSwap, unsigned int num_points); -static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){ - volk_16u_byteswap_u_orc_impl(intsToSwap, num_points); -} -#endif /* LV_HAVE_ORC */ - - #endif /* INCLUDED_volk_16u_byteswap_u_H */ -- cgit From 0e2f0c6473530ef6823a27f4f294826c777afe06 Mon Sep 17 00:00:00 2001 From: Tim O'Shea Date: Wed, 29 Aug 2012 20:27:18 -0400 Subject: adding gr_endian_swap block --- volk/include/volk/volk_32u_byteswap_u.h | 77 +++++++++++++++++++++++++++++ volk/include/volk/volk_64u_byteswap_u.h | 88 +++++++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+) create mode 100644 volk/include/volk/volk_32u_byteswap_u.h create mode 100644 volk/include/volk/volk_64u_byteswap_u.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_32u_byteswap_u.h b/volk/include/volk/volk_32u_byteswap_u.h new file mode 100644 index 000000000..e27d1f03d --- /dev/null +++ b/volk/include/volk/volk_32u_byteswap_u.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32u_byteswap_u_H +#define INCLUDED_volk_32u_byteswap_u_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + + uint32_t* inputPtr = intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_loadu_si128((__m128i*)inputPtr); + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = quarterPoints*4; + for(; number < num_points; number++){ + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_u_generic(uint32_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = intsToSwap; + + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32u_byteswap_u_H */ diff --git a/volk/include/volk/volk_64u_byteswap_u.h b/volk/include/volk/volk_64u_byteswap_u.h new file mode 100644 index 000000000..41a4a3130 --- /dev/null +++ b/volk/include/volk/volk_64u_byteswap_u.h @@ -0,0 +1,88 @@ +#ifndef INCLUDED_volk_64u_byteswap_u_H +#define INCLUDED_volk_64u_byteswap_u_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + uint64_t number = 0; + const unsigned int halfPoints = num_points / 2; + for(;number < halfPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_loadu_si128((__m128i*)inputPtr); + + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + + // Reorder the two words + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = halfPoints*2; + for(; number < num_points; number++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_u_generic(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64u_byteswap_u_H */ -- cgit From e826097e09fdfb04d14bf87861646b88229db881 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Sun, 13 Jan 2013 13:51:46 -0800 Subject: gras: support changeset for 3.6.4 used volk from next branch cf5c930d89ac89ba5a0da4a616c88d3c37e018ae for grextras support (it uses the dispatcher) empty stubs for the gr_basic_block msg passing. This is going to be difficult to figure out. The alias stuff may or may not be related most qa pass, there seems to be some additional issues, will be working through them on futher commits Conflicts: gnuradio-core/CMakeLists.txt gnuradio-core/src/lib/runtime/CMakeLists.txt gnuradio-core/src/lib/runtime/gr_block.cc gnuradio-core/src/lib/runtime/gr_block.h gnuradio-core/src/lib/runtime/gr_hier_block2.h gnuradio-core/src/lib/runtime/gr_top_block.h gnuradio-core/src/python/gnuradio/gr/__init__.py gr-audio/examples/c++/CMakeLists.txt gr-fcd/examples/c++/CMakeLists.txt grc/python/Port.py --- volk/include/volk/volk_prefs.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_prefs.h b/volk/include/volk/volk_prefs.h index 83d9baf89..690e5f99f 100644 --- a/volk/include/volk/volk_prefs.h +++ b/volk/include/volk/volk_prefs.h @@ -2,23 +2,26 @@ #define INCLUDED_VOLK_PREFS_H #include +#include __VOLK_DECL_BEGIN -struct volk_arch_pref { - char name[128]; - char arch[32]; -}; +typedef struct volk_arch_pref +{ + char name[128]; //name of the kernel + char impl_a[128]; //best aligned impl + char impl_u[128]; //best unaligned impl +} volk_arch_pref_t; //////////////////////////////////////////////////////////////////////// // get path to volk_config profiling info //////////////////////////////////////////////////////////////////////// -VOLK_API void get_config_path(char *); +VOLK_API void volk_get_config_path(char *); //////////////////////////////////////////////////////////////////////// // load prefs into global prefs struct //////////////////////////////////////////////////////////////////////// -VOLK_API int load_preferences(struct volk_arch_pref **); +VOLK_API size_t volk_load_preferences(volk_arch_pref_t **); __VOLK_DECL_END -- cgit