From 239144659b29c0a5ecd83a34e0e57387a1060ed7 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Tue, 7 Dec 2010 18:50:28 -0500
Subject: Initial checkin for VOLK - Vector-Optimized Library of Kernels. This
is a new SIMD library.
It currently stands by itself under the GNU Radio tree and can be used separately. We will integrate the build process into GNU Radio and start building on its functionality over time.
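
A quick usage sketch of one of the generic kernels (illustrative only; the
signature is taken from volk_16s_add_quad_aligned16.h in this checkin, while
the <volk/...> include path, the -DLV_HAVE_GENERIC=1 define, and the GCC
alignment attribute are assumptions about how a caller would build against
the installed headers):

    /* build with -DLV_HAVE_GENERIC=1 so the generic kernel is compiled in */
    #include <volk/volk_16s_add_quad_aligned16.h>

    int main() {
      /* the aligned16 kernels expect 16-byte aligned buffers */
      short t0[64] __attribute__((aligned(16)));
      short t1[64] __attribute__((aligned(16)));
      short t2[64] __attribute__((aligned(16)));
      short t3[64] __attribute__((aligned(16)));
      short s0[64] __attribute__((aligned(16)));
      short s1[64] __attribute__((aligned(16)));
      short s2[64] __attribute__((aligned(16)));
      short s3[64] __attribute__((aligned(16)));
      short s4[64] __attribute__((aligned(16)));
      int i;
      for (i = 0; i < 64; i++) { s0[i] = i; s1[i] = s2[i] = s3[i] = s4[i] = 1; }
      /* element-wise: t0 = s0+s1, t1 = s0+s2, t2 = s0+s3, t3 = s0+s4;
         the last argument is the buffer length in bytes (64 shorts = 128) */
      volk_16s_add_quad_aligned16_generic(t0, t1, t2, t3,
                                          s0, s1, s2, s3, s4,
                                          64 * sizeof(short));
      return 0;
    }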
---
volk/include/Makefile.am | 23 +
volk/include/volk/Makefile.am | 133 ++++++
volk/include/volk/archs.xml | 133 ++++++
volk/include/volk/emit_omnilog.py | 13 +
volk/include/volk/make_c.py | 76 ++++
volk/include/volk/make_config_fixed.py | 21 +
volk/include/volk/make_config_in.py | 13 +
volk/include/volk/make_cpuid_generic_c.py | 38 ++
volk/include/volk/make_cpuid_h.py | 25 ++
volk/include/volk/make_cpuid_powerpc_c.py | 45 ++
volk/include/volk/make_cpuid_x86_c.py | 93 ++++
volk/include/volk/make_environment_init_c.py | 33 ++
volk/include/volk/make_environment_init_h.py | 18 +
volk/include/volk/make_h.py | 28 ++
volk/include/volk/make_init_c.py | 42 ++
volk/include/volk/make_init_h.py | 26 ++
volk/include/volk/make_mktables.py | 33 ++
volk/include/volk/make_proccpu_sim.py | 25 ++
volk/include/volk/make_registry.py | 62 +++
volk/include/volk/make_runtime.py | 34 ++
volk/include/volk/make_runtime_c.py | 47 +++
volk/include/volk/make_set_simd.py | 202 +++++++++
volk/include/volk/make_typedefs.py | 23 +
volk/include/volk/volk_16s_add_quad_aligned16.h | 136 ++++++
.../volk/volk_16s_branch_4_state_8_aligned16.h | 194 +++++++++
volk/include/volk/volk_16s_convert_32f_aligned16.h | 119 ++++++
.../volk/volk_16s_convert_32f_unaligned16.h | 122 ++++++
volk/include/volk/volk_16s_convert_8s_aligned16.h | 69 +++
.../include/volk/volk_16s_convert_8s_unaligned16.h | 71 ++++
volk/include/volk/volk_16s_max_star_aligned16.h | 108 +++++
.../volk/volk_16s_max_star_horizontal_aligned16.h | 130 ++++++
.../volk_16s_permute_and_scalar_add_aligned16.h | 139 ++++++
.../volk/volk_16s_quad_max_star_aligned16.h | 191 +++++++++
.../volk/volk_16sc_deinterleave_16s_aligned16.h | 146 +++++++
.../volk/volk_16sc_deinterleave_32f_aligned16.h | 95 +++++
.../volk_16sc_deinterleave_real_16s_aligned16.h | 120 ++++++
.../volk_16sc_deinterleave_real_32f_aligned16.h | 125 ++++++
.../volk_16sc_deinterleave_real_8s_aligned16.h | 83 ++++
.../volk/volk_16sc_magnitude_16s_aligned16.h | 179 ++++++++
.../volk/volk_16sc_magnitude_32f_aligned16.h | 163 +++++++
volk/include/volk/volk_16u_byteswap_aligned16.h | 65 +++
volk/include/volk/volk_32f_accumulator_aligned16.h | 67 +++
volk/include/volk/volk_32f_add_aligned16.h | 69 +++
.../volk_32f_calc_spectral_noise_floor_aligned16.h | 167 ++++++++
volk/include/volk/volk_32f_convert_16s_aligned16.h | 110 +++++
.../volk/volk_32f_convert_16s_unaligned16.h | 113 +++++
volk/include/volk/volk_32f_convert_32s_aligned16.h | 106 +++++
.../volk/volk_32f_convert_32s_unaligned16.h | 109 +++++
volk/include/volk/volk_32f_convert_64f_aligned16.h | 70 +++
.../volk/volk_32f_convert_64f_unaligned16.h | 70 +++
volk/include/volk/volk_32f_convert_8s_aligned16.h | 117 ++++++
.../include/volk/volk_32f_convert_8s_unaligned16.h | 120 ++++++
volk/include/volk/volk_32f_divide_aligned16.h | 69 +++
volk/include/volk/volk_32f_dot_prod_aligned16.h | 184 ++++++++
volk/include/volk/volk_32f_dot_prod_unaligned16.h | 184 ++++++++
volk/include/volk/volk_32f_fm_detect_aligned16.h | 120 ++++++
volk/include/volk/volk_32f_index_max_aligned16.h | 148 +++++++
.../volk/volk_32f_interleave_16sc_aligned16.h | 155 +++++++
.../volk/volk_32f_interleave_32fc_aligned16.h | 75 ++++
volk/include/volk/volk_32f_max_aligned16.h | 71 ++++
volk/include/volk/volk_32f_min_aligned16.h | 71 ++++
volk/include/volk/volk_32f_multiply_aligned16.h | 69 +++
volk/include/volk/volk_32f_normalize_aligned16.h | 66 +++
volk/include/volk/volk_32f_power_aligned16.h | 144 +++++++
volk/include/volk/volk_32f_sqrt_aligned16.h | 64 +++
volk/include/volk/volk_32f_stddev_aligned16.h | 144 +++++++
.../volk/volk_32f_stddev_and_mean_aligned16.h | 169 ++++++++
volk/include/volk/volk_32f_subtract_aligned16.h | 67 +++
volk/include/volk/volk_32f_sum_of_poly_aligned16.h | 151 +++++++
.../volk/volk_32fc_32f_multiply_aligned16.h | 82 ++++
.../volk/volk_32fc_32f_power_32fc_aligned16.h | 109 +++++
volk/include/volk/volk_32fc_atan2_32f_aligned16.h | 158 +++++++
.../volk/volk_32fc_conjugate_dot_prod_aligned16.h | 344 +++++++++++++++
.../volk/volk_32fc_deinterleave_32f_aligned16.h | 75 ++++
.../volk/volk_32fc_deinterleave_64f_aligned16.h | 78 ++++
.../volk_32fc_deinterleave_real_16s_aligned16.h | 80 ++++
.../volk_32fc_deinterleave_real_32f_aligned16.h | 68 +++
.../volk_32fc_deinterleave_real_64f_aligned16.h | 66 +++
volk/include/volk/volk_32fc_dot_prod_aligned16.h | 468 +++++++++++++++++++++
volk/include/volk/volk_32fc_index_max_aligned16.h | 215 ++++++++++
.../volk/volk_32fc_magnitude_16s_aligned16.h | 146 +++++++
.../volk/volk_32fc_magnitude_32f_aligned16.h | 121 ++++++
volk/include/volk/volk_32fc_multiply_aligned16.h | 78 ++++
...olk_32fc_power_spectral_density_32f_aligned16.h | 134 ++++++
.../volk/volk_32fc_power_spectrum_32f_aligned16.h | 126 ++++++
.../include/volk/volk_32fc_square_dist_aligned16.h | 112 +++++
.../volk_32fc_square_dist_scalar_mult_aligned16.h | 126 ++++++
volk/include/volk/volk_32s_and_aligned16.h | 69 +++
volk/include/volk/volk_32s_convert_32f_aligned16.h | 73 ++++
.../volk/volk_32s_convert_32f_unaligned16.h | 75 ++++
volk/include/volk/volk_32s_or_aligned16.h | 69 +++
volk/include/volk/volk_32u_byteswap_aligned16.h | 77 ++++
volk/include/volk/volk_32u_popcnt_aligned16.h | 36 ++
volk/include/volk/volk_64f_convert_32f_aligned16.h | 67 +++
.../volk/volk_64f_convert_32f_unaligned16.h | 67 +++
volk/include/volk/volk_64f_max_aligned16.h | 71 ++++
volk/include/volk/volk_64f_min_aligned16.h | 71 ++++
volk/include/volk/volk_64u_byteswap_aligned16.h | 88 ++++
volk/include/volk/volk_64u_popcnt_aligned16.h | 74 ++++
volk/include/volk/volk_8s_convert_16s_aligned16.h | 71 ++++
.../include/volk/volk_8s_convert_16s_unaligned16.h | 73 ++++
volk/include/volk/volk_8s_convert_32f_aligned16.h | 92 ++++
.../include/volk/volk_8s_convert_32f_unaligned16.h | 94 +++++
.../volk/volk_8sc_deinterleave_16s_aligned16.h | 77 ++++
.../volk/volk_8sc_deinterleave_32f_aligned16.h | 164 ++++++++
.../volk_8sc_deinterleave_real_16s_aligned16.h | 66 +++
.../volk_8sc_deinterleave_real_32f_aligned16.h | 133 ++++++
.../volk/volk_8sc_deinterleave_real_8s_aligned16.h | 67 +++
.../volk_8sc_multiply_conjugate_16sc_aligned16.h | 102 +++++
.../volk_8sc_multiply_conjugate_32fc_aligned16.h | 122 ++++++
volk/include/volk/volk_common.h | 18 +
volk/include/volk/volk_complex.h | 71 ++++
volk/include/volk/volk_regexp.py | 8 +
volk/include/volk/volk_register.py | 279 ++++++++++++
114 files changed, 11370 insertions(+)
create mode 100644 volk/include/Makefile.am
create mode 100644 volk/include/volk/Makefile.am
create mode 100644 volk/include/volk/archs.xml
create mode 100644 volk/include/volk/emit_omnilog.py
create mode 100644 volk/include/volk/make_c.py
create mode 100644 volk/include/volk/make_config_fixed.py
create mode 100644 volk/include/volk/make_config_in.py
create mode 100644 volk/include/volk/make_cpuid_generic_c.py
create mode 100644 volk/include/volk/make_cpuid_h.py
create mode 100644 volk/include/volk/make_cpuid_powerpc_c.py
create mode 100644 volk/include/volk/make_cpuid_x86_c.py
create mode 100644 volk/include/volk/make_environment_init_c.py
create mode 100644 volk/include/volk/make_environment_init_h.py
create mode 100644 volk/include/volk/make_h.py
create mode 100644 volk/include/volk/make_init_c.py
create mode 100644 volk/include/volk/make_init_h.py
create mode 100644 volk/include/volk/make_mktables.py
create mode 100644 volk/include/volk/make_proccpu_sim.py
create mode 100644 volk/include/volk/make_registry.py
create mode 100644 volk/include/volk/make_runtime.py
create mode 100644 volk/include/volk/make_runtime_c.py
create mode 100644 volk/include/volk/make_set_simd.py
create mode 100644 volk/include/volk/make_typedefs.py
create mode 100644 volk/include/volk/volk_16s_add_quad_aligned16.h
create mode 100644 volk/include/volk/volk_16s_branch_4_state_8_aligned16.h
create mode 100644 volk/include/volk/volk_16s_convert_32f_aligned16.h
create mode 100644 volk/include/volk/volk_16s_convert_32f_unaligned16.h
create mode 100644 volk/include/volk/volk_16s_convert_8s_aligned16.h
create mode 100644 volk/include/volk/volk_16s_convert_8s_unaligned16.h
create mode 100644 volk/include/volk/volk_16s_max_star_aligned16.h
create mode 100644 volk/include/volk/volk_16s_max_star_horizontal_aligned16.h
create mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h
create mode 100644 volk/include/volk/volk_16s_quad_max_star_aligned16.h
create mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
create mode 100644 volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h
create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h
create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
create mode 100644 volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
create mode 100644 volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
create mode 100644 volk/include/volk/volk_16u_byteswap_aligned16.h
create mode 100644 volk/include/volk/volk_32f_accumulator_aligned16.h
create mode 100644 volk/include/volk/volk_32f_add_aligned16.h
create mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h
create mode 100644 volk/include/volk/volk_32f_convert_16s_aligned16.h
create mode 100644 volk/include/volk/volk_32f_convert_16s_unaligned16.h
create mode 100644 volk/include/volk/volk_32f_convert_32s_aligned16.h
create mode 100644 volk/include/volk/volk_32f_convert_32s_unaligned16.h
create mode 100644 volk/include/volk/volk_32f_convert_64f_aligned16.h
create mode 100644 volk/include/volk/volk_32f_convert_64f_unaligned16.h
create mode 100644 volk/include/volk/volk_32f_convert_8s_aligned16.h
create mode 100644 volk/include/volk/volk_32f_convert_8s_unaligned16.h
create mode 100644 volk/include/volk/volk_32f_divide_aligned16.h
create mode 100644 volk/include/volk/volk_32f_dot_prod_aligned16.h
create mode 100644 volk/include/volk/volk_32f_dot_prod_unaligned16.h
create mode 100644 volk/include/volk/volk_32f_fm_detect_aligned16.h
create mode 100644 volk/include/volk/volk_32f_index_max_aligned16.h
create mode 100644 volk/include/volk/volk_32f_interleave_16sc_aligned16.h
create mode 100644 volk/include/volk/volk_32f_interleave_32fc_aligned16.h
create mode 100644 volk/include/volk/volk_32f_max_aligned16.h
create mode 100644 volk/include/volk/volk_32f_min_aligned16.h
create mode 100644 volk/include/volk/volk_32f_multiply_aligned16.h
create mode 100644 volk/include/volk/volk_32f_normalize_aligned16.h
create mode 100644 volk/include/volk/volk_32f_power_aligned16.h
create mode 100644 volk/include/volk/volk_32f_sqrt_aligned16.h
create mode 100644 volk/include/volk/volk_32f_stddev_aligned16.h
create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_aligned16.h
create mode 100644 volk/include/volk/volk_32f_subtract_aligned16.h
create mode 100644 volk/include/volk/volk_32f_sum_of_poly_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_32f_multiply_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_atan2_32f_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_dot_prod_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_index_max_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_multiply_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_square_dist_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h
create mode 100644 volk/include/volk/volk_32s_and_aligned16.h
create mode 100644 volk/include/volk/volk_32s_convert_32f_aligned16.h
create mode 100644 volk/include/volk/volk_32s_convert_32f_unaligned16.h
create mode 100644 volk/include/volk/volk_32s_or_aligned16.h
create mode 100644 volk/include/volk/volk_32u_byteswap_aligned16.h
create mode 100644 volk/include/volk/volk_32u_popcnt_aligned16.h
create mode 100644 volk/include/volk/volk_64f_convert_32f_aligned16.h
create mode 100644 volk/include/volk/volk_64f_convert_32f_unaligned16.h
create mode 100644 volk/include/volk/volk_64f_max_aligned16.h
create mode 100644 volk/include/volk/volk_64f_min_aligned16.h
create mode 100644 volk/include/volk/volk_64u_byteswap_aligned16.h
create mode 100644 volk/include/volk/volk_64u_popcnt_aligned16.h
create mode 100644 volk/include/volk/volk_8s_convert_16s_aligned16.h
create mode 100644 volk/include/volk/volk_8s_convert_16s_unaligned16.h
create mode 100644 volk/include/volk/volk_8s_convert_32f_aligned16.h
create mode 100644 volk/include/volk/volk_8s_convert_32f_unaligned16.h
create mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h
create mode 100644 volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h
create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h
create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h
create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h
create mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h
create mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h
create mode 100644 volk/include/volk/volk_common.h
create mode 100644 volk/include/volk/volk_complex.h
create mode 100644 volk/include/volk/volk_regexp.py
create mode 100755 volk/include/volk/volk_register.py
diff --git a/volk/include/Makefile.am b/volk/include/Makefile.am
new file mode 100644
index 000000000..375d1a7d5
--- /dev/null
+++ b/volk/include/Makefile.am
@@ -0,0 +1,23 @@
+#
+# Copyright 2008 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+include $(top_srcdir)/Makefile.common
+
+SUBDIRS = volk
diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
new file mode 100644
index 000000000..fd59ab795
--- /dev/null
+++ b/volk/include/volk/Makefile.am
@@ -0,0 +1,133 @@
+#
+# Copyright 2008 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+include $(top_srcdir)/Makefile.common
+
+ourinclude_HEADERS = \
+ volk_complex.h \
+ volk_common.h \
+ volk_config_fixed.h \
+ volk_runtime.h \
+ volk_config.h \
+ volk_tables.h \
+ volk_typedefs.h \
+ volk.h \
+ volk_cpu.h \
+ volk_environment_init.h \
+ volk_16s_add_quad_aligned16.h \
+ volk_16s_branch_4_state_8_aligned16.h \
+ volk_16sc_deinterleave_16s_aligned16.h \
+ volk_16sc_deinterleave_32f_aligned16.h \
+ volk_16sc_deinterleave_real_16s_aligned16.h \
+ volk_16sc_deinterleave_real_32f_aligned16.h \
+ volk_16sc_deinterleave_real_8s_aligned16.h \
+ volk_16sc_magnitude_16s_aligned16.h \
+ volk_16sc_magnitude_32f_aligned16.h \
+ volk_16s_convert_32f_aligned16.h \
+ volk_16s_convert_32f_unaligned16.h \
+ volk_16s_convert_8s_aligned16.h \
+ volk_16s_convert_8s_unaligned16.h \
+ volk_16s_max_star_aligned16.h \
+ volk_16s_max_star_horizontal_aligned16.h \
+ volk_16s_permute_and_scalar_add_aligned16.h \
+ volk_16s_quad_max_star_aligned16.h \
+ volk_16u_byteswap_aligned16.h \
+ volk_32f_accumulator_aligned16.h \
+ volk_32f_add_aligned16.h \
+ volk_32fc_32f_multiply_aligned16.h \
+ volk_32fc_32f_power_32fc_aligned16.h \
+ volk_32f_calc_spectral_noise_floor_aligned16.h \
+ volk_32fc_atan2_32f_aligned16.h \
+ volk_32fc_conjugate_dot_prod_aligned16.h \
+ volk_32fc_deinterleave_32f_aligned16.h \
+ volk_32fc_deinterleave_64f_aligned16.h \
+ volk_32fc_deinterleave_real_16s_aligned16.h \
+ volk_32fc_deinterleave_real_32f_aligned16.h \
+ volk_32fc_deinterleave_real_64f_aligned16.h \
+ volk_32fc_dot_prod_aligned16.h \
+ volk_32fc_index_max_aligned16.h \
+ volk_32fc_magnitude_16s_aligned16.h \
+ volk_32fc_magnitude_32f_aligned16.h \
+ volk_32fc_multiply_aligned16.h \
+ volk_32f_convert_16s_aligned16.h \
+ volk_32f_convert_16s_unaligned16.h \
+ volk_32f_convert_32s_aligned16.h \
+ volk_32f_convert_32s_unaligned16.h \
+ volk_32f_convert_64f_aligned16.h \
+ volk_32f_convert_64f_unaligned16.h \
+ volk_32f_convert_8s_aligned16.h \
+ volk_32f_convert_8s_unaligned16.h \
+ volk_32fc_power_spectral_density_32f_aligned16.h \
+ volk_32fc_power_spectrum_32f_aligned16.h \
+ volk_32fc_square_dist_aligned16.h \
+ volk_32fc_square_dist_scalar_mult_aligned16.h \
+ volk_32f_divide_aligned16.h \
+ volk_32f_dot_prod_aligned16.h \
+ volk_32f_dot_prod_unaligned16.h \
+ volk_32f_fm_detect_aligned16.h \
+ volk_32f_index_max_aligned16.h \
+ volk_32f_interleave_16sc_aligned16.h \
+ volk_32f_interleave_32fc_aligned16.h \
+ volk_32f_max_aligned16.h \
+ volk_32f_min_aligned16.h \
+ volk_32f_multiply_aligned16.h \
+ volk_32f_normalize_aligned16.h \
+ volk_32f_power_aligned16.h \
+ volk_32f_sqrt_aligned16.h \
+ volk_32f_stddev_aligned16.h \
+ volk_32f_stddev_and_mean_aligned16.h \
+ volk_32f_subtract_aligned16.h \
+ volk_32f_sum_of_poly_aligned16.h \
+ volk_32s_and_aligned16.h \
+ volk_32s_convert_32f_aligned16.h \
+ volk_32s_convert_32f_unaligned16.h \
+ volk_32s_or_aligned16.h \
+ volk_32u_byteswap_aligned16.h \
+ volk_32u_popcnt_aligned16.h \
+ volk_64f_convert_32f_aligned16.h \
+ volk_64f_convert_32f_unaligned16.h \
+ volk_64f_max_aligned16.h \
+ volk_64f_min_aligned16.h \
+ volk_64u_byteswap_aligned16.h \
+ volk_64u_popcnt_aligned16.h \
+ volk_8sc_deinterleave_16s_aligned16.h \
+ volk_8sc_deinterleave_32f_aligned16.h \
+ volk_8sc_deinterleave_real_16s_aligned16.h \
+ volk_8sc_deinterleave_real_32f_aligned16.h \
+ volk_8sc_deinterleave_real_8s_aligned16.h \
+ volk_8sc_multiply_conjugate_16sc_aligned16.h \
+ volk_8sc_multiply_conjugate_32fc_aligned16.h \
+ volk_8s_convert_16s_aligned16.h \
+ volk_8s_convert_16s_unaligned16.h \
+ volk_8s_convert_32f_aligned16.h \
+ volk_8s_convert_32f_unaligned16.h
+
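+# the headers below are generated at build time by the volk_register.py machinery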
+distclean-local:
+ rm -f volk_config_fixed.h
+ rm -f volk_config.h
+ rm -f volk_cpu.h
+ rm -f volk.h
+ rm -f volk_registry.h
+ rm -f volk_runtime.h
+ rm -f volk_typedefs.h
+ rm -f volk_tables.h
+ rm -f *.pyc
+ rm -f Makefile.in
+ rm -f volk_environment_init.h
\ No newline at end of file
diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml
new file mode 100644
index 000000000..c9c441008
--- /dev/null
+++ b/volk/include/volk/archs.xml
@@ -0,0 +1,133 @@
+
+
+
+
+ none
+
+
+
+ maltivec
+
+
+
+ 0x80000001
+ m32
+ d
+ 29
+ 0
+
+
+
+ 0x80000001
+ d
+ 29
+ m64
+ 1
+
+
+
+ 0x80000001
+ d
+ 31
+ m3dnow
+ 1
+
+
+
+ 1
+ 0x80000001
+ d
+ 5
+ sse4.2
+
+
+
+ 1
+ 1
+ c
+ 23
+ mpopcnt
+
+
+
+ 1
+ 1
+ d
+ 23
+ mmmx
+
+
+
+
+ 1
+ 1
+ d
+ 25
+ msse
+ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+ xmmintrin.h
+
+
+
+
+ 1
+ 1
+ d
+ 26
+ msse2
+
+
+
+ 1
+ 1
+ c
+ 0
+ msse3
+ _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+ pmmintrin.h
+
+
+
+ 1
+ 1
+ c
+ 9
+ mssse3
+
+
+
+ 1
+ 0x80000001
+ c
+ 6
+ msse4a
+
+
+
+
+ 1
+ 1
+ c
+ 19
+ msse4.1
+
+
+
+ 1
+ 1
+ c
+ 20
+ msse4.2
+
+
+
+
+ 1
+ 1
+ c
+ 28
+ mavx
+
+
+
+
diff --git a/volk/include/volk/emit_omnilog.py b/volk/include/volk/emit_omnilog.py
new file mode 100644
index 000000000..309d7e578
--- /dev/null
+++ b/volk/include/volk/emit_omnilog.py
@@ -0,0 +1,13 @@
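+# helpers that emit the extern "C" { ... } guards wrapped around every generated C header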
+def emit_prolog():
+ tempstring = "";
+ tempstring = tempstring + '#ifdef __cplusplus\n';
+ tempstring = tempstring + 'extern "C" {\n';
+ tempstring = tempstring + '#endif\n';
+ return tempstring;
+def emit_epilog():
+ tempstring = "";
+ tempstring = tempstring + '#ifdef __cplusplus\n';
+ tempstring = tempstring + '}\n';
+ tempstring = tempstring + '#endif\n';
+ return tempstring;
+
diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py
new file mode 100644
index 000000000..08d6d949c
--- /dev/null
+++ b/volk/include/volk/make_c.py
@@ -0,0 +1,76 @@
+from xml.dom import minidom
+import string
+from volk_regexp import *
+
+
+def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) :
+ tempstring = "";
+ tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+
+ tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+ tempstring = tempstring + '\n\n#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n';
+ for func in funclist:
+ tempstring = tempstring + "#include\n" ;
+ tempstring = tempstring + '\n';
+
+ tempstring = tempstring + "static inline unsigned int volk_get_index(const char** indices, const char* arch, const int* arch_defs) {\n";
+ tempstring = tempstring + " int i = 1;\n"
+ tempstring = tempstring + " for(;i\n"
+ tempstring = tempstring + "#include \n"
+ tempstring = tempstring + "\n\n"
+
+ for domarch in dom:
+ if str(domarch.attributes["type"].value) == "all":
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+ tempstring = tempstring + " return 1;\n"
+ tempstring = tempstring + "}\n\n"
+
+ else:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+ tempstring = tempstring + " return 0;\n"
+ tempstring = tempstring + "}\n\n"
+
+ tempstring = tempstring + "void volk_cpu_init() {\n";
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n"
+ tempstring = tempstring + "}\n\n"
+
+ tempstring = tempstring + "unsigned int volk_get_lvarch() {\n";
+ tempstring = tempstring + " unsigned int retval = 0;\n"
+ tempstring = tempstring + " volk_cpu_init();\n"
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n"
+ tempstring = tempstring + " return retval;\n"
+ tempstring = tempstring + "}\n\n"
+
+ return tempstring;
diff --git a/volk/include/volk/make_cpuid_h.py b/volk/include/volk/make_cpuid_h.py
new file mode 100644
index 000000000..823e3b2c0
--- /dev/null
+++ b/volk/include/volk/make_cpuid_h.py
@@ -0,0 +1,25 @@
+from xml.dom import minidom
+from emit_omnilog import *
+
+def make_cpuid_h(dom) :
+ tempstring = "";
+ tempstring = tempstring +'/*this file is auto generated by volk_register.py*/';
+ tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_CPU_H';
+ tempstring = tempstring +'\n#define INCLUDED_VOLK_CPU_H\n\n';
+ tempstring = tempstring + emit_prolog();
+ tempstring = tempstring + '\n'
+
+ tempstring = tempstring + "struct VOLK_CPU {\n"
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + " int (*has_" + arch + ") ();\n";
+ tempstring = tempstring + "}volk_cpu;\n\n";
+
+ tempstring = tempstring + "void volk_cpu_init ();\n"
+ tempstring = tempstring + "unsigned int volk_get_lvarch ();\n"
+
+ tempstring = tempstring + "\n";
+ tempstring = tempstring + emit_epilog();
+ tempstring = tempstring + "#endif /*INCLUDED_VOLK_CPU_H*/\n"
+
+ return tempstring;
diff --git a/volk/include/volk/make_cpuid_powerpc_c.py b/volk/include/volk/make_cpuid_powerpc_c.py
new file mode 100644
index 000000000..443a58488
--- /dev/null
+++ b/volk/include/volk/make_cpuid_powerpc_c.py
@@ -0,0 +1,45 @@
+from xml.dom import minidom
+
+def make_cpuid_powerpc_c(dom) :
+ tempstring = "";
+ tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+ tempstring = tempstring + "#include \n"
+ tempstring = tempstring + "#include \n"
+ tempstring = tempstring + "\n\n"
+
+ #just assume it has them for powerpc
+ for domarch in dom:
+ if str(domarch.attributes["type"].value) == "powerpc":
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+ tempstring = tempstring + " return 1;\n"
+ tempstring = tempstring + "}\n\n"
+ elif str(domarch.attributes["type"].value) == "all":
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+ tempstring = tempstring + " return 1;\n"
+ tempstring = tempstring + "}\n\n"
+ else:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+ tempstring = tempstring + " return 0;\n"
+ tempstring = tempstring + "}\n\n"
+
+
+ tempstring = tempstring + "void volk_cpu_init() {\n";
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n"
+
+ tempstring = tempstring + "}\n\n"
+ tempstring = tempstring + "unsigned int volk_get_lvarch() {\n";
+ tempstring = tempstring + " unsigned int retval = 0;\n"
+ tempstring = tempstring + " volk_cpu_init();\n"
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n"
+ tempstring = tempstring + " return retval;\n"
+ tempstring = tempstring + "}\n\n"
+
+ return tempstring;
+
diff --git a/volk/include/volk/make_cpuid_x86_c.py b/volk/include/volk/make_cpuid_x86_c.py
new file mode 100644
index 000000000..fd83e5e5c
--- /dev/null
+++ b/volk/include/volk/make_cpuid_x86_c.py
@@ -0,0 +1,93 @@
+from xml.dom import minidom
+
+def make_cpuid_x86_c(dom) :
+ tempstring = "";
+ tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+ tempstring = tempstring + "#include \n"
+ tempstring = tempstring + "#include \n"
+ tempstring = tempstring + "\n\n"
+ tempstring = tempstring + "extern void cpuid_x86 (unsigned int op, unsigned int result[4]);\n\n"
+ tempstring = tempstring + "static inline unsigned int cpuid_eax(unsigned int op) {\n";
+ tempstring = tempstring + " unsigned int regs[4];\n"
+ tempstring = tempstring + " cpuid_x86 (op, regs);\n"
+ tempstring = tempstring + " return regs[0];\n"
+ tempstring = tempstring + "}\n\n";
+
+ tempstring = tempstring + "static inline unsigned int cpuid_ebx(unsigned int op) {\n";
+ tempstring = tempstring + " unsigned int regs[4];\n"
+ tempstring = tempstring + " cpuid_x86 (op, regs);\n"
+ tempstring = tempstring + " return regs[1];\n"
+ tempstring = tempstring + "}\n\n";
+
+ tempstring = tempstring + "static inline unsigned int cpuid_ecx(unsigned int op) {\n";
+ tempstring = tempstring + " unsigned int regs[4];\n"
+ tempstring = tempstring + " cpuid_x86 (op, regs);\n"
+ tempstring = tempstring + " return regs[2];\n"
+ tempstring = tempstring + "}\n\n";
+
+ tempstring = tempstring + "static inline unsigned int cpuid_edx(unsigned int op) {\n";
+ tempstring = tempstring + " unsigned int regs[4];\n"
+ tempstring = tempstring + " cpuid_x86 (op, regs);\n"
+ tempstring = tempstring + " return regs[3];\n"
+ tempstring = tempstring + "}\n\n";
+
+ for domarch in dom:
+ if str(domarch.attributes["type"].value) == "x86":
+ arch = str(domarch.attributes["name"].value);
+ op = domarch.getElementsByTagName("op");
+ op = str(op[0].firstChild.data);
+ reg = domarch.getElementsByTagName("reg");
+ reg = str(reg[0].firstChild.data);
+ shift = domarch.getElementsByTagName("shift");
+ shift = str(shift[0].firstChild.data);
+ val = domarch.getElementsByTagName("val");
+ val = str(val[0].firstChild.data);
+
+ if op == "1":
+ tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+ tempstring = tempstring + " unsigned int e" + reg + "x = cpuid_e" + reg + "x (" + op + ");\n"
+ tempstring = tempstring + " return ((e" + reg + "x >> " + shift + ") & 1) == " + val + ";\n"
+ tempstring = tempstring + "}\n\n";
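+        # e.g. an entry with op 1, reg d, shift 25, val 1 (presumably the SSE
+        # entry) makes the lines above emit roughly:
+        #   int i_can_has_sse () {
+        #     unsigned int edx = cpuid_edx (1);
+        #     return ((edx >> 25) & 1) == 1;
+        #   }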
+
+ elif op == "0x80000001":
+ tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+ tempstring = tempstring + " unsigned int extended_fct_count = cpuid_eax(0x80000000);\n";
+ tempstring = tempstring + " if (extended_fct_count < 0x80000001)\n";
+ tempstring = tempstring + " return "+ val + "^1;\n\n"
+ tempstring = tempstring + " unsigned int extended_features = cpuid_e" + reg + "x (" + op + ");\n";
+ tempstring = tempstring + " return ((extended_features >> " + shift + ") & 1) == " + val + ";\n"
+ tempstring = tempstring + "}\n\n";
+ elif str(domarch.attributes["type"].value) == "all":
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+ tempstring = tempstring + " return 1;\n"
+ tempstring = tempstring + "}\n\n"
+ else:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+ tempstring = tempstring + " return 0;\n"
+ tempstring = tempstring + "}\n\n"
+
+ tempstring = tempstring + "void volk_cpu_init() {\n";
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n"
+ tempstring = tempstring + "}\n\n"
+
+ tempstring = tempstring + "unsigned int volk_get_lvarch() {\n";
+ tempstring = tempstring + " unsigned int retval = 0;\n"
+ tempstring = tempstring + " volk_cpu_init();\n"
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n"
+ tempstring = tempstring + " return retval;\n"
+ tempstring = tempstring + "}\n\n"
+
+ return tempstring;
+
+
+
+
+
+
+
diff --git a/volk/include/volk/make_environment_init_c.py b/volk/include/volk/make_environment_init_c.py
new file mode 100644
index 000000000..e06c7f246
--- /dev/null
+++ b/volk/include/volk/make_environment_init_c.py
@@ -0,0 +1,33 @@
+from xml.dom import minidom
+
+def make_environment_init_c(dom) :
+ tempstring = "";
+ tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+ tempstring = tempstring + "#include\n"
+ tempstring = tempstring + "#include\n"
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ incs = domarch.getElementsByTagName("include");
+ for inc in incs:
+ my_inc = str(inc.firstChild.data);
+ tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n";
+ tempstring = tempstring + "#include<" + my_inc + ">\n";
+ tempstring = tempstring + "#endif\n"
+ tempstring = tempstring + '\n\n';
+ tempstring = tempstring + "void volk_environment_init(){\n"
+
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ envs = domarch.getElementsByTagName("environment");
+ for env in envs:
+ cmd = str(env.firstChild.data);
+ tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n";
+ tempstring = tempstring + " " + cmd + "\n";
+ tempstring = tempstring + "#endif\n"
+
+ tempstring = tempstring + "}\n";
+ return tempstring;
+
+
+
+
diff --git a/volk/include/volk/make_environment_init_h.py b/volk/include/volk/make_environment_init_h.py
new file mode 100644
index 000000000..77a841a24
--- /dev/null
+++ b/volk/include/volk/make_environment_init_h.py
@@ -0,0 +1,18 @@
+from xml.dom import minidom
+from emit_omnilog import *
+
+def make_environment_init_h() :
+ tempstring = "";
+ tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+ tempstring = tempstring + "#ifndef INCLUDE_LIBVECTOR_ENVIRONMENT_INIT_H\n";
+ tempstring = tempstring + "#define INCLUDE_LIBVECTOR_ENVIRONMENT_INIT_H\n";
+ tempstring = tempstring + "\n";
+ tempstring = tempstring + emit_prolog();
+ tempstring = tempstring + "void volk_environment_init();\n";
+ tempstring = tempstring + emit_epilog();
+ tempstring = tempstring + "#endif\n"
+ return tempstring;
+
+
+
+
diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py
new file mode 100644
index 000000000..81d9ad401
--- /dev/null
+++ b/volk/include/volk/make_h.py
@@ -0,0 +1,28 @@
+from xml.dom import minidom
+from emit_omnilog import *
+from volk_regexp import *
+
+
+
+def make_h(funclist, arched_arglist, retlist) :
+ tempstring = "";
+ tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+ tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_H';
+ tempstring = tempstring + '\n#define INCLUDED_VOLK_H';
+ tempstring = tempstring + '\n\n#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n'
+ tempstring = tempstring + emit_prolog()
+ tempstring = tempstring + '\n';
+
+ for i in range(len(retlist)):
+ tempstring = tempstring + retlist[i] + funclist[i] + replace_bracket.sub(";", replace_arch.sub("", arched_arglist[i])) + '\n';
+ tempstring = tempstring + retlist[i] + funclist[i] + "_manual" + replace_bracket.sub(";", arched_arglist[i]) + '\n';
+
+ tempstring = tempstring + emit_epilog();
+
+ tempstring = tempstring + "#endif /*INCLUDED_VOLK_H*/\n";
+
+ return tempstring;
diff --git a/volk/include/volk/make_init_c.py b/volk/include/volk/make_init_c.py
new file mode 100644
index 000000000..330e19592
--- /dev/null
+++ b/volk/include/volk/make_init_c.py
@@ -0,0 +1,42 @@
+from xml.dom import minidom
+
+def make_init_c(funclist, dom) :
+ tempstring = "";
+ tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+
+ tempstring = tempstring + '\n\n#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n';
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ incs = domarch.getElementsByTagName("include");
+ for inc in incs:
+ my_inc = str(inc.firstChild.data);
+ tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n";
+ tempstring = tempstring + "#include<" + my_inc + ">\n";
+ tempstring = tempstring + "#endif\n"
+ tempstring = tempstring + '\n\n';
+
+ tempstring = tempstring + "extern struct VOLK_RUNTIME volk_runtime;\n\n";
+ tempstring = tempstring + "struct VOLK_RUNTIME* get_volk_runtime(){\n";
+ tempstring = tempstring + " return &volk_runtime;\n";
+ tempstring = tempstring + "}\n\n"
+ tempstring = tempstring + " void volk_runtime_init() {\nvolk_cpu_init();\n";
+
+ for func in funclist:
+ tempstring = tempstring + " volk_runtime." + func + " = default_acquire_" + func + ";\n";
+
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ envs = domarch.getElementsByTagName("environment");
+ for env in envs:
+ cmd = str(env.firstChild.data);
+ tempstring = tempstring + " if(volk_cpu.has_" + arch + "()){\n";
+ tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n";
+ tempstring = tempstring + " " + cmd + "\n";
+ tempstring = tempstring + "#endif\n"
+ tempstring = tempstring + " }\n";
+
+ tempstring = tempstring + "}\n";
+
+ return tempstring
diff --git a/volk/include/volk/make_init_h.py b/volk/include/volk/make_init_h.py
new file mode 100644
index 000000000..6dbe1c585
--- /dev/null
+++ b/volk/include/volk/make_init_h.py
@@ -0,0 +1,26 @@
+from xml.dom import minidom
+from emit_omnilog import *
+from volk_regexp import *
+
+
+
+def make_init_h(funclist, arched_arglist, retlist) :
+ tempstring = "";
+ tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+
+ tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_INIT_H';
+ tempstring = tempstring + '\n#define INCLUDED_VOLK_INIT_H';
+ tempstring = tempstring + '\n\n#include\n';
+ tempstring = tempstring + '#include\n';
+
+ tempstring = tempstring + '\n';
+
+ tempstring = tempstring + emit_prolog();
+
+ for i in range(len(retlist)):
+ tempstring = tempstring + retlist[i] + " default_acquire_" + funclist[i] + replace_bracket.sub(";", replace_arch.sub("", arched_arglist[i])) + '\n';
+
+ tempstring= tempstring + emit_epilog();
+ tempstring = tempstring + "#endif /*INCLUDED_VOLK_INIT_H*/\n";
+
+ return tempstring;
diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py
new file mode 100644
index 000000000..051ac268d
--- /dev/null
+++ b/volk/include/volk/make_mktables.py
@@ -0,0 +1,33 @@
+
+
+def make_mktables(funclist) :
+ tempstring = "";
+ tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n';
+
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n';
+    tempstring = tempstring + '#include\n';
+ tempstring = tempstring + "\n\n";
+
+ tempstring = tempstring + 'int main() {\n';
+ tempstring = tempstring + ' int i = 0;\n';
+ tempstring = tempstring + ' FILE* output;\n';
+ tempstring = tempstring + ' output = fopen("volk_tables.h", "w");\n';
+ tempstring = tempstring + ' fprintf(output, "#ifndef INCLUDED_VOLK_TABLES_H\\n");\n';
+ tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n';
+
+ for func in funclist:
+ tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", volk_rank_archs(' + func + '_arch_defs, volk_get_lvarch()));\n';
+ tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n';
+ tempstring = tempstring + ' fclose(output);\n'
+ tempstring = tempstring + '}\n';
+ return tempstring;
+
+
+
+
+
+
+
+
diff --git a/volk/include/volk/make_proccpu_sim.py b/volk/include/volk/make_proccpu_sim.py
new file mode 100644
index 000000000..c75a4d5fb
--- /dev/null
+++ b/volk/include/volk/make_proccpu_sim.py
@@ -0,0 +1,25 @@
+from xml.dom import minidom
+
+def make_proccpu_sim(dom) :
+ tempstring = "";
+ tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+ tempstring = tempstring + "#include \n"
+ tempstring = tempstring + "#include \n"
+ tempstring = tempstring + "\n\n"
+
+ tempstring = tempstring + "void test_append(char* buf, int val, char* newkey){\n";
+ tempstring = tempstring + " if(val==1){\n";
+ tempstring = tempstring + " sprintf(buf, \"%s %s\", buf, newkey);\n";
+ tempstring = tempstring + " }\n";
+ tempstring = tempstring + "}\n";
+ tempstring = tempstring + "\n\n";
+
+ tempstring = tempstring + "int main() {\n";
+ tempstring = tempstring + " volk_cpu_init();\n";
+ tempstring = tempstring + " char buf[2048];\n";
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + " test_append(buf, volk_cpu.has_" + arch + "(), \"" + arch + "\");\n"
+ tempstring = tempstring + " printf(\"%s\\n\", buf);\n"
+ tempstring = tempstring + "}\n"
+ return tempstring;
diff --git a/volk/include/volk/make_registry.py b/volk/include/volk/make_registry.py
new file mode 100644
index 000000000..8457d61f3
--- /dev/null
+++ b/volk/include/volk/make_registry.py
@@ -0,0 +1,62 @@
+from xml.dom import minidom
+from emit_omnilog import *
+import string
+
+def make_registry(dom, funclist, fcountlist) :
+ tempstring = "";
+ tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+ tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_REGISTRY_H';
+ tempstring = tempstring +'\n#define INCLUDED_VOLK_REGISTRY_H\n\n';
+ tempstring = tempstring +'#include\n';
+ tempstring = tempstring +'#include\n';
+ tempstring = tempstring + emit_prolog();
+ tempstring = tempstring + '\n'
+
+
+
+
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring +"#if LV_HAVE_" + arch.swapcase() + "\n";
+ tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 1\n";
+ tempstring = tempstring +"#else\n";
+ tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 0\n";
+ tempstring = tempstring +"#endif /*LV_HAVE_" + arch.swapcase() + "*/\n\n";
+
+ counter = 0;
+ for fcount in fcountlist:
+ tempstring = tempstring + "static const int " + funclist[counter] + "_arch_defs[] = {\n";
+ counter = counter + 1;
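+    # layout of the emitted <func>_arch_defs[]: element 0 is the number of
+    # implementations compiled in (sum over candidates of the LV_*_CNT
+    # products), followed by one arch bitmask per compiled-in candidate,
+    # each guarded by the matching #if LV_HAVE_... block below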
+ for arch_list in fcount:
+ tempstring = tempstring + " (LV_"
+ for ind in range(len(arch_list)):
+ tempstring = tempstring + arch_list[ind] + "_CNT";
+ if ind < len(arch_list) - 1:
+ tempstring = tempstring + " * LV_";
+ tempstring = tempstring + ") + ";
+ lindex = tempstring.rfind(" + ");
+ tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], " + ", "");
+ tempstring = tempstring + ",\n"
+ for arch_list in fcount:
+ tempstring = tempstring + "#if LV_HAVE_"
+ for ind in range(len(arch_list)):
+ tempstring = tempstring + arch_list[ind];
+ if ind < len(arch_list) - 1:
+ tempstring = tempstring + " && LV_HAVE_";
+ tempstring = tempstring + "\n"
+ tempstring = tempstring + " (1 << LV_"
+ for ind in range(len(arch_list)):
+ tempstring = tempstring + arch_list[ind];
+ if ind < len(arch_list) - 1:
+ tempstring = tempstring + ") + (1 << LV_"
+ tempstring = tempstring + "),\n#endif\n"
+ lindex = tempstring.rfind(",");
+ tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", "");
+ tempstring = tempstring + "};\n\n"
+
+
+ tempstring = tempstring + emit_epilog();
+ tempstring = tempstring +"#endif /*INCLUDED_VOLK_REGISTRY_H*/\n";
+
+ return tempstring;
+
diff --git a/volk/include/volk/make_runtime.py b/volk/include/volk/make_runtime.py
new file mode 100644
index 000000000..645b3aaee
--- /dev/null
+++ b/volk/include/volk/make_runtime.py
@@ -0,0 +1,34 @@
+from xml.dom import minidom
+from emit_omnilog import *
+from volk_regexp import *
+
+
+
+def make_runtime(funclist) :
+ tempstring = "";
+ tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n';
+
+ tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_RUNTIME';
+ tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME';
+ tempstring = tempstring + '\n\n#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + emit_prolog();
+
+ tempstring = tempstring + '\n';
+
+ tempstring = tempstring + "struct VOLK_RUNTIME {\n";
+
+ for i in range(len(funclist)):
+ tempstring = tempstring + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + ";\n";
+ tempstring = tempstring + "};\n\n";
+
+ tempstring = tempstring + "struct VOLK_RUNTIME* get_volk_runtime();\n\n"
+ tempstring = tempstring + "\nvoid volk_runtime_init();\n";
+
+ tempstring = tempstring + emit_epilog();
+ tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n";
+
+ return tempstring;
+
diff --git a/volk/include/volk/make_runtime_c.py b/volk/include/volk/make_runtime_c.py
new file mode 100644
index 000000000..070df9ba7
--- /dev/null
+++ b/volk/include/volk/make_runtime_c.py
@@ -0,0 +1,47 @@
+from xml.dom import minidom
+import string
+from volk_regexp import *
+
+
+def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) :
+ tempstring = "";
+ tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+
+
+ tempstring = tempstring + '\n\n#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + "#include\n";
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n';
+ tempstring = tempstring + '#include\n';
+
+ for func in funclist:
+ tempstring = tempstring + "#include\n" ;
+ tempstring = tempstring + '\n';
+
+ tempstring = tempstring + "struct VOLK_RUNTIME volk_runtime;\n";
+
+ for i in range(len(funclist)):
+ tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n";
+
+ tags_counter = 0;
+ for arch_list in fcountlist[i]:
+ tempstring = tempstring + "#if LV_HAVE_"
+ for ind in range(len(arch_list)):
+
+ tempstring = tempstring + arch_list[ind];
+ if ind < len(arch_list) - 1:
+ tempstring = tempstring + " && LV_HAVE_";
+
+ tempstring = tempstring + "\n " + funclist[i] + "_" + str(taglist[i][tags_counter]) + ",\n#endif\n";
+ tags_counter = tags_counter + 1;
+
+ lindex = tempstring.rfind(",");
+ tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", "");
+ tempstring = tempstring + "};\n\n";
+
+
+ tempstring = tempstring + retlist[i] + "default_acquire_" + funclist[i] + replace_arch.sub("", arched_arglist[i]) + '\n';
+ tempstring = tempstring + "volk_runtime." + funclist[i] + " = " + funclist[i] + "_archs[volk_rank_archs(" + funclist[i] + "_arch_defs, volk_get_lvarch())];\n" + "return " + funclist[i] + "_archs[volk_rank_archs(" + funclist[i] + "_arch_defs, volk_get_lvarch())](" + my_arglist[i] + ");" + '\n}\n';
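+    # the two lines above generate default_acquire_<name>(): it ranks the
+    # compiled-in archs with volk_rank_archs(), caches the winning kernel in
+    # volk_runtime.<name>, and forwards the current call to it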
+
+ return tempstring;
diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py
new file mode 100644
index 000000000..e320bc748
--- /dev/null
+++ b/volk/include/volk/make_set_simd.py
@@ -0,0 +1,202 @@
+from xml.dom import minidom
+
+def make_set_simd(dom) :
+ tempstring = "";
+ tempstring = tempstring +'dnl this file is auto generated by volk_register.py\n\n';
+
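+  # _MAKE_FAKE_PROCCPU compiles and runs the small volk_proccpu_sim probe for
+  # the host CPU type; its output (a space separated list of arch names)
+  # becomes the default value of --with-lv_arch in LV_SET_SIMD_FLAGS below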
+ tempstring = tempstring + "AC_DEFUN([_MAKE_FAKE_PROCCPU],\n";
+ tempstring = tempstring + "[\n";
+ tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n";
+ tempstring = tempstring + " AC_MSG_CHECKING([proccpu])\n";
+ tempstring = tempstring + " case \"$MD_CPU\" in\n";
+ tempstring = tempstring + " (x86)\n";
+ tempstring = tempstring + " case \"$MD_SUBCPU\" in\n";
+ tempstring = tempstring + " (x86)\n";
+ tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ -I./lib lib/volk_proccpu_sim.c lib/volk_cpu_x86.c lib/cpuid_x86.S 2>&1`\"\n";
+ tempstring = tempstring + " then\n";
+ tempstring = tempstring + " AC_MSG_RESULT(yes)\n";
+ tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n";
+ tempstring = tempstring + " rm -f proccpu\n";
+ tempstring = tempstring + " else\n";
+ tempstring = tempstring + " AC_MSG_RESULT(no)\n";
+ tempstring = tempstring + " lv_PROCCPU=no\n";
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " ;;\n"
+ tempstring = tempstring + " (*)\n"
+ tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ -I./lib lib/volk_proccpu_sim.c lib/volk_cpu_x86.c lib/cpuid_x86_64.S 2>&1`\"\n";
+ tempstring = tempstring + " then\n";
+ tempstring = tempstring + " AC_MSG_RESULT(yes)\n";
+ tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n";
+ tempstring = tempstring + " rm -f proccpu\n";
+ tempstring = tempstring + " else\n";
+ tempstring = tempstring + " AC_MSG_RESULT(no)\n";
+ tempstring = tempstring + " lv_PROCCPU=no\n";
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " ;;\n"
+ tempstring = tempstring + " esac\n"
+ tempstring = tempstring + " ;;\n";
+ tempstring = tempstring + " (powerpc)\n";
+ tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ lib/volk_proccpu_sim.c lib/volk_cpu_powerpc.c 2>&1`\"\n";
+ tempstring = tempstring + " then\n";
+ tempstring = tempstring + " AC_MSG_RESULT(yes)\n";
+ tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n";
+ tempstring = tempstring + " rm -f proccpu\n";
+ tempstring = tempstring + " else\n";
+ tempstring = tempstring + " AC_MSG_RESULT(no)\n";
+ tempstring = tempstring + " lv_PROCCPU=no\n";
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " ;;\n";
+ tempstring = tempstring + " (*)\n";
+ tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ lib/volk_proccpu_sim.c lib/volk_cpu_generic.c 2>&1`\"\n";
+ tempstring = tempstring + " then\n";
+ tempstring = tempstring + " AC_MSG_RESULT(yes)\n";
+ tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n";
+ tempstring = tempstring + " rm -f proccpu\n";
+ tempstring = tempstring + " else\n";
+ tempstring = tempstring + " AC_MSG_RESULT(no)\n";
+ tempstring = tempstring + " lv_PROCCPU=no\n";
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " ;;\n";
+ tempstring = tempstring + " esac\n";
+ tempstring = tempstring + "])\n"
+
+ for domarch in dom:
+ if str(domarch.attributes["type"].value) != "all":
+ arch = str(domarch.attributes["name"].value);
+ flag = domarch.getElementsByTagName("flag");
+ flag = str(flag[0].firstChild.data);
+ tempstring = tempstring + "AC_DEFUN([_TRY_ADD_" + arch.swapcase() + "],\n";
+ tempstring = tempstring + "[\n";
+ tempstring = tempstring + " LF_CHECK_CC_FLAG([-" + flag + "])\n";
+ tempstring = tempstring + " LF_CHECK_CXX_FLAG([-" + flag + "])\n";
+ tempstring = tempstring + "])\n";
+
+ tempstring = tempstring + "AC_DEFUN([LV_SET_SIMD_FLAGS],\n";
+ tempstring = tempstring + "[\n";
+ tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n";
+ tempstring = tempstring + " AC_SUBST(LV_CXXFLAGS)\n";
+ tempstring = tempstring + " indCC=no\n";
+ tempstring = tempstring + " indCXX=no\n";
+ tempstring = tempstring + " indLV_ARCH=no\n";
+ tempstring = tempstring + " AC_ARG_WITH(lv_arch,\n";
+ tempstring = tempstring + " AC_HELP_STRING([--with-lv_arch=ARCH],[set volk hardware speedups as space separated string with elements from the following list(";
+
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + arch + ", "
+ tempstring = tempstring[0:len(tempstring) - 2];
+
+ tempstring = tempstring + ")]),\n";
+ tempstring = tempstring + " [cf_with_lv_arch=\"$withval\"],\n";
+ tempstring = tempstring + " [cf_with_lv_arch=\"\"])\n";
+ if str(domarch.attributes["type"].value) == "all":
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n";
+ tempstring = tempstring + " ADDONS=\"\"\n";
+ tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n";
+ tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n";
+ tempstring = tempstring + " cf_with_lv_arch=$lv_PROCCPU\n";
+
+ tempstring = tempstring + " fi\n";
+ tempstring = tempstring + " echo $cf_with_lv_arch\n";
+ for domarch in dom:
+ if str(domarch.attributes["type"].value) != "all":
+ arch = str(domarch.attributes["name"].value);
+ tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=no\n";
+
+ tempstring = tempstring + " case \"$MD_CPU\" in\n";
+ tempstring = tempstring + " (x86)\n"
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ atype = str(domarch.attributes["type"].value);
+ if atype == "x86":
+ tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n";
+
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ atype = str(domarch.attributes["type"].value);
+ flag = domarch.getElementsByTagName("flag");
+ flag = str(flag[0].firstChild.data);
+ if atype == "x86":
+ tempstring = tempstring + " for i in $lf_CXXFLAGS\n"
+ tempstring = tempstring + " do\n"
+ tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n";
+ tempstring = tempstring + " indCXX=yes\n";
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " done\n"
+ tempstring = tempstring + " for i in $lf_CFLAGS\n"
+ tempstring = tempstring + " do\n"
+ tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n";
+ tempstring = tempstring + " indCC=yes\n";
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " done\n"
+ tempstring = tempstring + " for i in $cf_with_lv_arch\n"
+ tempstring = tempstring + " do\n"
+ tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n";
+ tempstring = tempstring + " indLV_ARCH=yes\n"
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " done\n"
+ tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n"
+ tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+ tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n";
+ tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " indCC=no\n"
+ tempstring = tempstring + " indCXX=no\n"
+ tempstring = tempstring + " indLV_ARCH=no\n"
+ elif atype == "all":
+ tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+ tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
+ tempstring = tempstring + " ;;\n"
+
+ tempstring = tempstring + " (powerpc)\n"
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ atype = str(domarch.attributes["type"].value);
+ if atype == "powerpc":
+ tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n";
+
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ atype = str(domarch.attributes["type"].value);
+ flag = domarch.getElementsByTagName("flag");
+ flag = str(flag[0].firstChild.data);
+ if atype == "powerpc":
+ tempstring = tempstring + " for i in $lf_CXXFLAGS\n"
+ tempstring = tempstring + " do\n"
+ tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n";
+ tempstring = tempstring + " indCXX=yes\n";
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " done\n"
+ tempstring = tempstring + " for i in $lf_CFLAGS\n"
+ tempstring = tempstring + " do\n"
+ tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n";
+ tempstring = tempstring + " indCC=yes\n";
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " done\n"
+ tempstring = tempstring + " for i in $cf_with_lv_arch\n"
+ tempstring = tempstring + " do\n"
+ tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n";
+ tempstring = tempstring + " indLV_ARCH=yes\n"
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " done\n"
+          tempstring = tempstring + "          if test \"$indCC\" = yes && test \"$indCXX\" = yes && test \"$indLV_ARCH\" = yes; then\n"
+ tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+ tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n";
+ tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " indCC=no\n"
+ tempstring = tempstring + " indCXX=no\n"
+ tempstring = tempstring + " indLV_ARCH=no\n"
+ elif atype == "all":
+ tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+ tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
+ tempstring = tempstring + " ;;\n"
+ tempstring = tempstring + " esac\n"
+ tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n"
+ tempstring = tempstring + "])\n"
+
+ return tempstring;
+
+
+
diff --git a/volk/include/volk/make_typedefs.py b/volk/include/volk/make_typedefs.py
new file mode 100644
index 000000000..fe81cb2b0
--- /dev/null
+++ b/volk/include/volk/make_typedefs.py
@@ -0,0 +1,23 @@
+from xml.dom import minidom
+import string
+from volk_regexp import *
+
+
+
+def make_typedefs(funclist, retlist, my_argtypelist) :
+ tempstring = "";
+ tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+ tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+ tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_TYPEDEFS';
+ tempstring = tempstring + '\n#define INCLUDED_VOLK_TYPEDEFS\n';
+ tempstring = tempstring + '\n\n#include\n';
+ tempstring = tempstring + '#include\n';
+
+ tempstring = tempstring + '\n';
+
+ for i in range(len(funclist)):
+ tempstring = tempstring + "typedef " + retlist[i] +" (*" + replace_volk.sub("p", funclist[i]) + ")(" + my_argtypelist[i] + ");\n\n";
+
+ tempstring = tempstring + "#endif /*INCLUDED_VOLK_TYPEDEFS*/\n";
+
+ return tempstring;
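For reference, the loop above emits one function-pointer typedef per kernel. One line of the generated volk_typedefs.h would look roughly like the sketch below; the exact pointer-type name comes from replace_volk.sub in volk_regexp.py, so the p_ prefix shown here is only an assumption.

  typedef void (*p_16s_add_quad_aligned16)(short*, short*, short*, short*, short*, short*, short*, short*, short*, unsigned int);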
diff --git a/volk/include/volk/volk_16s_add_quad_aligned16.h b/volk/include/volk/volk_16s_add_quad_aligned16.h
new file mode 100644
index 000000000..63042bef1
--- /dev/null
+++ b/volk/include/volk/volk_16s_add_quad_aligned16.h
@@ -0,0 +1,136 @@
+#ifndef INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H
+#define INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H
+
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+
+
+
+#if LV_HAVE_SSE2
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+static inline void volk_16s_add_quad_aligned16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
+ p_target0 = (__m128i*)target0;
+ p_target1 = (__m128i*)target1;
+ p_target2 = (__m128i*)target2;
+ p_target3 = (__m128i*)target3;
+
+ p_src0 = (__m128i*)src0;
+ p_src1 = (__m128i*)src1;
+ p_src2 = (__m128i*)src2;
+ p_src3 = (__m128i*)src3;
+ p_src4 = (__m128i*)src4;
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int leftovers = (num_bytes >> 1) & 7;
+
+ for(; i < bound; ++i) {
+ xmm0 = _mm_load_si128(p_src0);
+ xmm1 = _mm_load_si128(p_src1);
+ xmm2 = _mm_load_si128(p_src2);
+ xmm3 = _mm_load_si128(p_src3);
+ xmm4 = _mm_load_si128(p_src4);
+
+ p_src0 += 1;
+ p_src1 += 1;
+
+ xmm1 = _mm_add_epi16(xmm0, xmm1);
+ xmm2 = _mm_add_epi16(xmm0, xmm2);
+ xmm3 = _mm_add_epi16(xmm0, xmm3);
+ xmm4 = _mm_add_epi16(xmm0, xmm4);
+
+
+ p_src2 += 1;
+ p_src3 += 1;
+ p_src4 += 1;
+
+ _mm_store_si128(p_target0, xmm1);
+ _mm_store_si128(p_target1, xmm2);
+ _mm_store_si128(p_target2, xmm3);
+ _mm_store_si128(p_target3, xmm4);
+
+ p_target0 += 1;
+ p_target1 += 1;
+ p_target2 += 1;
+ p_target3 += 1;
+ }
+ /*asm volatile
+ (
+ ".%=volk_16s_add_quad_aligned16_sse2_L1:\n\t"
+ "cmp $0, %[bound]\n\t"
+ "je .%=volk_16s_add_quad_aligned16_sse2_END\n\t"
+ "movaps (%[src0]), %%xmm1\n\t"
+ "movaps (%[src1]), %%xmm2\n\t"
+ "movaps (%[src2]), %%xmm3\n\t"
+ "movaps (%[src3]), %%xmm4\n\t"
+ "movaps (%[src4]), %%xmm5\n\t"
+ "add $16, %[src0]\n\t"
+ "add $16, %[src1]\n\t"
+ "add $16, %[src2]\n\t"
+ "add $16, %[src3]\n\t"
+ "add $16, %[src4]\n\t"
+ "paddw %%xmm1, %%xmm2\n\t"
+ "paddw %%xmm1, %%xmm3\n\t"
+ "paddw %%xmm1, %%xmm4\n\t"
+ "paddw %%xmm1, %%xmm5\n\t"
+ "add $-1, %[bound]\n\t"
+ "movaps %%xmm2, (%[target0])\n\t"
+ "movaps %%xmm3, (%[target1])\n\t"
+ "movaps %%xmm4, (%[target2])\n\t"
+ "movaps %%xmm5, (%[target3])\n\t"
+ "add $16, %[target0]\n\t"
+ "add $16, %[target1]\n\t"
+ "add $16, %[target2]\n\t"
+ "add $16, %[target3]\n\t"
+ "jmp .%=volk_16s_add_quad_aligned16_sse2_L1\n\t"
+ ".%=volk_16s_add_quad_aligned16_sse2_END:\n\t"
+ :
+ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
+ :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+
+ */
+
+
+ for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ target0[i] = src0[i] + src1[i];
+ target1[i] = src0[i] + src2[i];
+ target2[i] = src0[i] + src3[i];
+ target3[i] = src0[i] + src4[i];
+ }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_16s_add_quad_aligned16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ for(i = 0; i < bound; ++i) {
+ target0[i] = src0[i] + src1[i];
+ target1[i] = src0[i] + src2[i];
+ target2[i] = src0[i] + src3[i];
+ target3[i] = src0[i] + src4[i];
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+
+#endif /*INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H*/
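A minimal caller sketch for the generic variant above (illustrative values, not part of the patch). Note that num_bytes is the byte length of one buffer, so eight shorts correspond to 16 bytes.

  short src0[8] __attribute__((aligned(16))) = {1, 2, 3, 4, 5, 6, 7, 8};
  short ones[8] __attribute__((aligned(16))) = {1, 1, 1, 1, 1, 1, 1, 1};
  short t0[8], t1[8], t2[8], t3[8];
  /* each target[i] becomes src0[i] + srcN[i]; here all four targets receive src0 + 1 */
  volk_16s_add_quad_aligned16_generic(t0, t1, t2, t3, src0, ones, ones, ones, ones, 16);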
diff --git a/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h b/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h
new file mode 100644
index 000000000..fb9d7cb87
--- /dev/null
+++ b/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h
@@ -0,0 +1,194 @@
+#ifndef INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H
+#define INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H
+
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+
+
+#if LV_HAVE_SSSE3
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+static inline void volk_16s_branch_4_state_8_aligned16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+
+ __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+
+
+
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+ p_cntl2 = (__m128i*)cntl2;
+ p_cntl3 = (__m128i*)cntl3;
+ p_scalars = (__m128i*)scalars;
+
+ int i = 0;
+
+ int bound = 1;
+
+
+ xmm0 = _mm_load_si128(p_scalars);
+
+ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+
+ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+ xmm0 = _mm_load_si128((__m128i*)permuters[0]);
+ xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+ xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+ xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+
+ for(; i < bound; ++i) {
+
+ xmm5 = _mm_load_si128(p_src0);
+
+
+
+
+
+
+
+
+
+ xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
+ xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+ xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+ xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+
+ p_src0 += 4;
+
+
+ xmm5 = _mm_add_epi16(xmm1, xmm2);
+
+ xmm6 = _mm_add_epi16(xmm2, xmm6);
+ xmm8 = _mm_add_epi16(xmm1, xmm8);
+
+
+ xmm7 = _mm_load_si128(p_cntl2);
+ xmm9 = _mm_load_si128(p_cntl3);
+
+ xmm0 = _mm_add_epi16(xmm5, xmm0);
+
+
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+ xmm5 = _mm_load_si128(&p_cntl2[1]);
+ xmm11 = _mm_load_si128(&p_cntl3[1]);
+
+ xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm7);
+
+
+
+ xmm7 = _mm_load_si128(&p_cntl2[2]);
+ xmm9 = _mm_load_si128(&p_cntl3[2]);
+
+ xmm5 = _mm_add_epi16(xmm5, xmm11);
+
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+ xmm6 = _mm_add_epi16(xmm6, xmm5);
+
+
+ xmm5 = _mm_load_si128(&p_cntl2[3]);
+ xmm11 = _mm_load_si128(&p_cntl3[3]);
+
+ xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+ xmm8 = _mm_add_epi16(xmm8, xmm7);
+
+ xmm5 = _mm_add_epi16(xmm5, xmm11);
+
+ _mm_store_si128(p_target, xmm0);
+ _mm_store_si128(&p_target[1], xmm6);
+
+ xmm10 = _mm_add_epi16(xmm5, xmm10);
+
+ _mm_store_si128(&p_target[2], xmm8);
+
+ _mm_store_si128(&p_target[3], xmm10);
+
+ p_target += 3;
+ }
+}
+
+
+#endif /*LV_HAVE_SSSE3*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_branch_4_state_8_aligned16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+ int i = 0;
+
+ int bound = 4;
+
+ for(; i < bound; ++i) {
+ target[i* 8] = src0[((char)permuters[i][0])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8] & scalars[2])
+ + (cntl3[i * 8] & scalars[3]);
+ target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 1] & scalars[2])
+ + (cntl3[i * 8 + 1] & scalars[3]);
+ target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 2] & scalars[2])
+ + (cntl3[i * 8 + 2] & scalars[3]);
+ target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 3] & scalars[2])
+ + (cntl3[i * 8 + 3] & scalars[3]);
+ target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 4] & scalars[2])
+ + (cntl3[i * 8 + 4] & scalars[3]);
+ target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 5] & scalars[2])
+ + (cntl3[i * 8 + 5] & scalars[3]);
+ target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 6] & scalars[2])
+ + (cntl3[i * 8 + 6] & scalars[3]);
+ target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 7] & scalars[2])
+ + (cntl3[i * 8 + 7] & scalars[3]);
+
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_convert_32f_aligned16.h b/volk/include/volk/volk_16s_convert_32f_aligned16.h
new file mode 100644
index 000000000..126ce1528
--- /dev/null
+++ b/volk/include/volk/volk_16s_convert_32f_aligned16.h
@@ -0,0 +1,119 @@
+#ifndef INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H
+#define INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data and divides each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value that each output data point is divided by
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_16s_convert_32f_aligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
+
+ for(;number < eighthPoints; number++){
+
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data and divides each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value that each output data point is divided by
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_16s_convert_32f_aligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data and divides each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value that each output data point is divided by
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_16s_convert_32f_aligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H */
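As a usage sketch for the generic variant (invented values, not part of the patch), dividing by 32768 maps Q15-style samples onto roughly [-1, 1):

  int16_t in[4] __attribute__((aligned(16))) = {-32768, -16384, 16384, 32767};
  float out[4] __attribute__((aligned(16)));
  volk_16s_convert_32f_aligned16_generic(out, in, 32768.0, 4);
  /* out is now approximately {-1.0, -0.5, 0.5, 0.99997} */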
diff --git a/volk/include/volk/volk_16s_convert_32f_unaligned16.h b/volk/include/volk/volk_16s_convert_32f_unaligned16.h
new file mode 100644
index 000000000..d6212fba5
--- /dev/null
+++ b/volk/include/volk/volk_16s_convert_32f_unaligned16.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H
+#define INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data and divides each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value that each output data point is divided by
+ \param num_points The number of data values to be converted
+ \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_16s_convert_32f_unaligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
+
+ for(;number < eighthPoints; number++){
+
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data and divides each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value that each output data point is divided by
+ \param num_points The number of data values to be converted
+ \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_16s_convert_32f_unaligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data and divides each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value that each output data point is divided by
+ \param num_points The number of data values to be converted
+ \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_16s_convert_32f_unaligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_aligned16.h b/volk/include/volk/volk_16s_convert_8s_aligned16.h
new file mode 100644
index 000000000..64c368688
--- /dev/null
+++ b/volk/include/volk/volk_16s_convert_8s_aligned16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H
+#define INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Converts the input 16 bit integer data into 8 bit integer data
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param num_points The number of data values to be converted
+*/
+static inline void volk_16s_convert_8s_aligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal1;
+ __m128i inputVal2;
+ __m128i ret;
+
+ for(;number < sixteenthPoints; number++){
+
+ // Load the 16 values
+ inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+ inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+
+ inputVal1 = _mm_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm_srai_epi16(inputVal2, 8);
+
+ ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, ret);
+
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ outputVector[number] =(int8_t)(inputVector[number] >> 8);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the input 16 bit integer data into 8 bit integer data
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param num_points The number of data values to be converted
+*/
+static inline void volk_16s_convert_8s_aligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H */
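A short usage sketch of the generic variant (invented values, not part of the patch): the conversion keeps only the top 8 bits of each sample, i.e. an arithmetic shift right by 8.

  int16_t in16[2] = {0x7F00, (int16_t)0x8100};
  int8_t out8[2];
  volk_16s_convert_8s_aligned16_generic(out8, in16, 2);
  /* out8[0] == 127 (0x7F), out8[1] == -127 ((int8_t)0x81) */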
diff --git a/volk/include/volk/volk_16s_convert_8s_unaligned16.h b/volk/include/volk/volk_16s_convert_8s_unaligned16.h
new file mode 100644
index 000000000..ca925de86
--- /dev/null
+++ b/volk/include/volk/volk_16s_convert_8s_unaligned16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H
+#define INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Converts the input 16 bit integer data into 8 bit integer data
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param num_points The number of data values to be converted
+ \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16s_convert_8s_unaligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal1;
+ __m128i inputVal2;
+ __m128i ret;
+
+ for(;number < sixteenthPoints; number++){
+
+ // Load the 16 values
+ inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+ inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+
+ inputVal1 = _mm_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm_srai_epi16(inputVal2, 8);
+
+ ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
+
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ outputVector[number] =(int8_t)(inputVector[number] >> 8);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the input 16 bit integer data into 8 bit integer data
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param num_points The number of data values to be converted
+ \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16s_convert_8s_unaligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_max_star_aligned16.h b/volk/include/volk/volk_16s_max_star_aligned16.h
new file mode 100644
index 000000000..ba4e979ec
--- /dev/null
+++ b/volk/include/volk/volk_16s_max_star_aligned16.h
@@ -0,0 +1,108 @@
+#ifndef INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H
+#define INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H
+
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#if LV_HAVE_SSSE3
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+static inline void volk_16s_max_star_aligned16_ssse3(short* target, short* src0, unsigned int num_bytes) {
+
+
+
+ short candidate = src0[0];
+ short cands[8];
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+
+
+ __m128i *p_src0;
+
+ p_src0 = (__m128i*)src0;
+
+ int bound = num_bytes >> 4;
+ int leftovers = (num_bytes >> 1) & 7;
+
+ int i = 0;
+
+
+ xmm1 = _mm_setzero_si128();
+ xmm0 = _mm_setzero_si128();
+ //_mm_insert_epi16(xmm0, candidate, 0);
+
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
+
+
+ for(i = 0; i < bound; ++i) {
+ xmm1 = _mm_load_si128(p_src0);
+ p_src0 += 1;
+ xmm2 = _mm_sub_epi16(xmm1, xmm0);
+
+
+
+
+
+
+ xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
+ xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
+ xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
+
+ xmm6 = _mm_xor_si128(xmm4, xmm5);
+
+ xmm3 = _mm_and_si128(xmm3, xmm0);
+ xmm4 = _mm_and_si128(xmm6, xmm1);
+
+ xmm0 = _mm_add_epi16(xmm3, xmm4);
+
+
+ }
+
+ _mm_store_si128((__m128i*)cands, xmm0);
+
+ for(i = 0; i < 8; ++i) {
+ candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+ }
+
+
+
+ for(i = 0; i < leftovers; ++i) {
+
+ candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
+ }
+
+ target[0] = candidate;
+
+
+
+
+
+}
+
+#endif /*LV_HAVE_SSSE3*/
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_16s_max_star_aligned16_generic(short* target, short* src0, unsigned int num_bytes) {
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ short candidate = src0[0];
+ for(i = 1; i < bound; ++i) {
+ candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
+ }
+ target[0] = candidate;
+
+}
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H*/
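For illustration (invented values, not part of the patch), the generic max-star reduction leaves the largest element of the buffer in target[0]; num_bytes is the buffer length in bytes.

  short vals[6] = {3, -7, 42, 0, 5, 41};
  short result;
  volk_16s_max_star_aligned16_generic(&result, vals, 12); /* 6 shorts = 12 bytes */
  /* result == 42 */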
diff --git a/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h b/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h
new file mode 100644
index 000000000..82d011677
--- /dev/null
+++ b/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h
@@ -0,0 +1,130 @@
+#ifndef INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H
+#define INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H
+
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#if LV_HAVE_SSSE3
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+static inline void volk_16s_max_star_horizontal_aligned16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+
+ const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
+ const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+ const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
+
+
+
+ volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ __m128i xmm5, xmm6, xmm7, xmm8;
+
+ xmm4 = _mm_load_si128((__m128i*)shufmask0);
+ xmm5 = _mm_load_si128((__m128i*)shufmask1);
+ xmm6 = _mm_load_si128((__m128i*)andmask0);
+ xmm7 = _mm_load_si128((__m128i*)andmask1);
+
+ __m128i *p_target, *p_src0;
+
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+
+ int bound = num_bytes >> 5;
+ int intermediate = (num_bytes >> 4) & 1;
+ int leftovers = (num_bytes >> 1) & 7;
+
+ int i = 0;
+
+
+ for(i = 0; i < bound; ++i) {
+
+ xmm0 = _mm_load_si128(p_src0);
+ xmm1 = _mm_load_si128(&p_src0[1]);
+
+
+
+ xmm2 = _mm_xor_si128(xmm2, xmm2);
+ p_src0 += 2;
+
+ xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+
+ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+ xmm8 = _mm_and_si128(xmm2, xmm6);
+ xmm3 = _mm_and_si128(xmm2, xmm7);
+
+
+ xmm8 = _mm_add_epi8(xmm8, xmm4);
+ xmm3 = _mm_add_epi8(xmm3, xmm5);
+
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
+ xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
+
+
+ xmm3 = _mm_add_epi16(xmm0, xmm1);
+
+
+ _mm_store_si128(p_target, xmm3);
+
+ p_target += 1;
+
+ }
+
+ for(i = 0; i < intermediate; ++i) {
+
+ xmm0 = _mm_load_si128(p_src0);
+
+
+ xmm2 = _mm_xor_si128(xmm2, xmm2);
+ p_src0 += 1;
+
+ xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+ xmm8 = _mm_and_si128(xmm2, xmm6);
+
+ xmm3 = _mm_add_epi8(xmm8, xmm4);
+
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
+
+
+ _mm_storel_pd((double*)p_target, (__m128d)xmm0);
+
+ p_target = (__m128i*)((int8_t*)p_target + 8);
+
+ }
+
+ for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
+ target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+ }
+
+
+}
+
+#endif /*LV_HAVE_SSSE3*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_max_star_horizontal_aligned16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+
+ for(i = 0; i < bound; i += 2) {
+ target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
+ }
+
+}
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H*/
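Usage sketch for the generic horizontal variant (invented values, not part of the patch): each output element is the larger member of one adjacent input pair, so the output holds half as many shorts as the input.

  int16_t in[8] = {1, 2, 5, -3, -8, -9, 7, 7};
  int16_t out[4];
  volk_16s_max_star_horizontal_aligned16_generic(out, in, 16); /* 8 shorts = 16 bytes */
  /* out == {2, 5, -8, 7} */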
diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h
new file mode 100644
index 000000000..452d05c4f
--- /dev/null
+++ b/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h
@@ -0,0 +1,139 @@
+#ifndef INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+#define INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+
+
+#if LV_HAVE_SSE2
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+static inline void volk_16s_permute_and_scalar_add_aligned16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+ __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
+
+ short* p_permute_indexes = permute_indexes;
+
+ p_target = (__m128i*)target;
+ p_cntl0 = (__m128i*)cntl0;
+ p_cntl1 = (__m128i*)cntl1;
+ p_cntl2 = (__m128i*)cntl2;
+ p_cntl3 = (__m128i*)cntl3;
+ p_scalars = (__m128i*)scalars;
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int leftovers = (num_bytes >> 1) & 7;
+
+ xmm0 = _mm_load_si128(p_scalars);
+
+ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+
+ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+
+ for(; i < bound; ++i) {
+ xmm0 = _mm_setzero_si128();
+ xmm5 = _mm_setzero_si128();
+ xmm6 = _mm_setzero_si128();
+ xmm7 = _mm_setzero_si128();
+
+ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
+ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
+ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
+ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
+ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
+ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
+ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
+ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+ xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+ p_permute_indexes += 8;
+
+ xmm0 = _mm_add_epi16(xmm0, xmm6);
+
+ xmm5 = _mm_load_si128(p_cntl0);
+ xmm6 = _mm_load_si128(p_cntl1);
+ xmm7 = _mm_load_si128(p_cntl2);
+
+ xmm5 = _mm_and_si128(xmm5, xmm1);
+ xmm6 = _mm_and_si128(xmm6, xmm2);
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+
+ xmm5 = _mm_load_si128(p_cntl3);
+
+ xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+ p_cntl0 += 1;
+
+ xmm5 = _mm_and_si128(xmm5, xmm4);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm6);
+
+ p_cntl1 += 1;
+ p_cntl2 += 1;
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+
+ p_cntl3 += 1;
+
+ _mm_store_si128(p_target, xmm0);
+
+ p_target += 1;
+ }
+
+
+
+
+
+ for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ target[i] = src0[permute_indexes[i]]
+ + (cntl0[i] & scalars[0])
+ + (cntl1[i] & scalars[1])
+ + (cntl2[i] & scalars[2])
+ + (cntl3[i] & scalars[3]);
+ }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_permute_and_scalar_add_aligned16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ for(i = 0; i < bound; ++i) {
+ target[i] = src0[permute_indexes[i]]
+ + (cntl0[i] & scalars[0])
+ + (cntl1[i] & scalars[1])
+ + (cntl2[i] & scalars[2])
+ + (cntl3[i] & scalars[3]);
+
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H*/
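A sketch of the generic permute-and-scalar-add (invented values, not part of the patch): each output element is a permuted source element plus whichever scalars the all-ones/all-zeros control masks let through.

  short src[4]     = {100, 200, 300, 400};
  short perm[4]    = {3, 2, 1, 0};        /* reverses the source */
  short all[4]     = {-1, -1, -1, -1};    /* mask passes its scalar through */
  short none[4]    = {0, 0, 0, 0};        /* mask blocks its scalar */
  short scalars[4] = {1, 2, 4, 8};
  short out[4];
  volk_16s_permute_and_scalar_add_aligned16_generic(out, src, perm, all, none, none, all, scalars, 8);
  /* out[0] == src[3] + (-1 & 1) + 0 + 0 + (-1 & 8) == 400 + 9 == 409 */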
diff --git a/volk/include/volk/volk_16s_quad_max_star_aligned16.h b/volk/include/volk/volk_16s_quad_max_star_aligned16.h
new file mode 100644
index 000000000..1004c4d23
--- /dev/null
+++ b/volk/include/volk/volk_16s_quad_max_star_aligned16.h
@@ -0,0 +1,191 @@
+#ifndef INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H
+#define INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H
+
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+
+
+
+#if LV_HAVE_SSE2
+
+#include <emmintrin.h>
+
+static inline void volk_16s_quad_max_star_aligned16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+
+
+
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int bound_copy = bound;
+ int leftovers = (num_bytes >> 1) & 7;
+
+ __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
+ p_target = (__m128i*) target;
+ p_src0 = (__m128i*)src0;
+ p_src1 = (__m128i*)src1;
+ p_src2 = (__m128i*)src2;
+ p_src3 = (__m128i*)src3;
+
+
+
+ __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+ while(bound_copy > 0) {
+
+ xmm1 = _mm_load_si128(p_src0);
+ xmm2 = _mm_load_si128(p_src1);
+ xmm3 = _mm_load_si128(p_src2);
+ xmm4 = _mm_load_si128(p_src3);
+
+ xmm5 = _mm_setzero_si128();
+ xmm6 = _mm_setzero_si128();
+ xmm7 = xmm1;
+ xmm8 = xmm3;
+
+
+ xmm1 = _mm_sub_epi16(xmm2, xmm1);
+
+
+
+ xmm3 = _mm_sub_epi16(xmm4, xmm3);
+
+ xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
+ xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
+
+
+
+ xmm2 = _mm_and_si128(xmm5, xmm2);
+ xmm4 = _mm_and_si128(xmm6, xmm4);
+ xmm5 = _mm_andnot_si128(xmm5, xmm7);
+ xmm6 = _mm_andnot_si128(xmm6, xmm8);
+
+ xmm5 = _mm_add_epi16(xmm2, xmm5);
+ xmm6 = _mm_add_epi16(xmm4, xmm6);
+
+
+ xmm1 = _mm_xor_si128(xmm1, xmm1);
+ xmm2 = xmm5;
+ xmm5 = _mm_sub_epi16(xmm6, xmm5);
+ p_src0 += 1;
+ bound_copy -= 1;
+
+ xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
+ p_src1 += 1;
+
+ xmm6 = _mm_and_si128(xmm1, xmm6);
+
+ xmm1 = _mm_andnot_si128(xmm1, xmm2);
+ p_src2 += 1;
+
+
+
+ xmm1 = _mm_add_epi16(xmm6, xmm1);
+ p_src3 += 1;
+
+
+ _mm_store_si128(p_target, xmm1);
+ p_target += 1;
+
+ }
+
+
+ /*asm volatile
+ (
+ "volk_16s_quad_max_star_aligned16_sse2_L1:\n\t"
+ "cmp $0, %[bound]\n\t"
+ "je volk_16s_quad_max_star_aligned16_sse2_END\n\t"
+
+ "movaps (%[src0]), %%xmm1\n\t"
+ "movaps (%[src1]), %%xmm2\n\t"
+ "movaps (%[src2]), %%xmm3\n\t"
+ "movaps (%[src3]), %%xmm4\n\t"
+
+ "pxor %%xmm5, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm6\n\t"
+ "movaps %%xmm1, %%xmm7\n\t"
+ "movaps %%xmm3, %%xmm8\n\t"
+ "psubw %%xmm2, %%xmm1\n\t"
+ "psubw %%xmm4, %%xmm3\n\t"
+
+ "pcmpgtw %%xmm1, %%xmm5\n\t"
+ "pcmpgtw %%xmm3, %%xmm6\n\t"
+
+ "pand %%xmm5, %%xmm2\n\t"
+ "pand %%xmm6, %%xmm4\n\t"
+ "pandn %%xmm7, %%xmm5\n\t"
+ "pandn %%xmm8, %%xmm6\n\t"
+
+ "paddw %%xmm2, %%xmm5\n\t"
+ "paddw %%xmm4, %%xmm6\n\t"
+
+ "pxor %%xmm1, %%xmm1\n\t"
+ "movaps %%xmm5, %%xmm2\n\t"
+
+ "psubw %%xmm6, %%xmm5\n\t"
+ "add $16, %[src0]\n\t"
+ "add $-1, %[bound]\n\t"
+
+ "pcmpgtw %%xmm5, %%xmm1\n\t"
+ "add $16, %[src1]\n\t"
+
+ "pand %%xmm1, %%xmm6\n\t"
+
+ "pandn %%xmm2, %%xmm1\n\t"
+ "add $16, %[src2]\n\t"
+
+ "paddw %%xmm6, %%xmm1\n\t"
+ "add $16, %[src3]\n\t"
+
+ "movaps %%xmm1, (%[target])\n\t"
+ "addw $16, %[target]\n\t"
+ "jmp volk_16s_quad_max_star_aligned16_sse2_L1\n\t"
+
+ "volk_16s_quad_max_star_aligned16_sse2_END:\n\t"
+ :
+ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
+ :
+ );
+ */
+
+ short temp0 = 0;
+ short temp1 = 0;
+ for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+ temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+ target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+ }
+ return;
+
+
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_quad_max_star_aligned16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ short temp0 = 0;
+ short temp1 = 0;
+ for(i = 0; i < bound; ++i) {
+ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+ temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+ target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+ }
+}
+
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H*/
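Usage sketch for the generic quad max-star (invented values, not part of the patch): the output holds the element-wise maximum across the four source vectors.

  short a[4] = {1, 9, 3, 4}, b[4] = {5, 2, 3, 8};
  short c[4] = {0, 0, 10, 0}, d[4] = {7, 1, 2, 2};
  short out[4];
  volk_16s_quad_max_star_aligned16_generic(out, a, b, c, d, 8); /* 4 shorts = 8 bytes */
  /* out == {7, 9, 10, 8} */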
diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
new file mode 100644
index 000000000..32e13df98
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
@@ -0,0 +1,146 @@
+#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_16s_aligned16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+
+ __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+ __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
+ qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *int16ComplexVectorPtr++;
+ *qBufferPtr++ = *int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_16s_aligned16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
+ iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
+
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ unsigned int number;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H */
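Usage sketch for the generic deinterleaver (invented values, not part of the patch): interleaved I/Q samples are split into separate I and Q buffers.

  int16_t interleaved[8] __attribute__((aligned(16))) = {10, -1, 20, -2, 30, -3, 40, -4};
  int16_t i_out[4], q_out[4];
  volk_16sc_deinterleave_16s_aligned16_generic(i_out, q_out, (const lv_16sc_t*)interleaved, 4);
  /* i_out == {10, 20, 30, 40}, q_out == {-1, -2, -3, -4} */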
diff --git a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
new file mode 100644
index 000000000..86f67437d
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
@@ -0,0 +1,95 @@
+#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Converts the complex 16 bit vector into floats, scales each data point, and deinterleaves into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param scalar The value that each input data value is divided by
+ \param num_points The number of complex data values to be deinterleaved
+ */
+static inline void volk_16sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ uint64_t number = 0;
+ const uint64_t quarterPoints = num_points / 4;
+ __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ float floatBuffer[8] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+
+ floatBuffer[0] = (float)(complexVectorPtr[0]);
+ floatBuffer[1] = (float)(complexVectorPtr[1]);
+ floatBuffer[2] = (float)(complexVectorPtr[2]);
+ floatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ floatBuffer[4] = (float)(complexVectorPtr[4]);
+ floatBuffer[5] = (float)(complexVectorPtr[5]);
+ floatBuffer[6] = (float)(complexVectorPtr[6]);
+ floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+ /*!
+ \brief Converts the complex 16 bit vector into floats, scales each data point, and deinterleaves into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param scalar The value that each input data value is divided by
+ \param num_points The number of complex data values to be deinterleaved
+ */
+static inline void volk_16sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int number;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H */
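Usage sketch for the scaled float deinterleaver (invented values, not part of the patch; a scalar of 32768 is assumed for Q15 input):

  int16_t interleaved[8] __attribute__((aligned(16))) = {16384, 0, -16384, 0, 8192, 0, 0, 0};
  float i_out[4], q_out[4];
  volk_16sc_deinterleave_32f_aligned16_generic(i_out, q_out, (const lv_16sc_t*)interleaved, 32768.0, 4);
  /* i_out == {0.5, -0.5, 0.25, 0.0}; q_out is all zeros */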
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h
new file mode 100644
index 000000000..b594c85b8
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_16s_aligned16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i complexVal1, complexVal2, iOutputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
+ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+ iOutputVal = _mm_or_si128(complexVal1, complexVal2);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_16s_aligned16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i complexVal1, complexVal2, iOutputVal;
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
+ complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h
new file mode 100644
index 000000000..3e7be1e64
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h
@@ -0,0 +1,125 @@
+#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I float vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param scalar The value that each data point is divided by
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iFloatValue;
+
+ const float iScalar= 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+
+ for(;number < quarterPoints; number++){
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+ iIntVal = _mm_cvtepi16_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+ sixteenTComplexVectorPtr++;
+ }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I float vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param scalar The value that each data point is divided by
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 iValue;
+
+ const float iScalar = 1.0/scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ float floatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+ floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+
+ iValue = _mm_load_ps(floatBuffer);
+
+ iValue = _mm_mul_ps(iValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
+ complexVectorPtr++;
+ }
+
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I float vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param scalar The value that each data point is divided by
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ const float invScalar = 1.0 / scalar;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
new file mode 100644
index 000000000..c0d1e941a
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
@@ -0,0 +1,83 @@
+#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for(number = 0; number < sixteenthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+ complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+
+ complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+ complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+
+ complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+
+
+ complexVal1 = _mm_srai_epi16(complexVal1, 8);
+ complexVal3 = _mm_srai_epi16(complexVal3, 8);
+
+ iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256));
+ int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256);
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
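+
+/* Illustrative usage sketch (assumed buffer length): keeps the top 8 bits of
+   each 16-bit I sample, matching the divide-by-256 in the code above.
+
+     lv_16sc_t in[512] __attribute__((aligned(16)));
+     int8_t out[512] __attribute__((aligned(16)));
+     volk_16sc_deinterleave_real_8s_aligned16_generic(out, in, 512);
+*/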
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
new file mode 100644
index 000000000..1482ab82e
--- /dev/null
+++ b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
@@ -0,0 +1,179 @@
+#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16sc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 vScalar = _mm_set_ps1(32768.0);
+ __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ float inputFloatBuffer[8] __attribute__((aligned(128)));
+ float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ result = _mm_mul_ps(result, vScalar); // Scale the results
+
+ _mm_store_ps(outputFloatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+ *magnitudeVectorPtr++ = (int16_t)(val1Result);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16sc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 vScalar = _mm_set_ps1(32768.0);
+ __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+ float inputFloatBuffer[4] __attribute__((aligned(128)));
+ float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ cplxValue1 = _mm_load_ps(inputFloatBuffer);
+ complexVectorPtr += 4;
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ cplxValue2 = _mm_load_ps(inputFloatBuffer);
+ complexVectorPtr += 4;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ result = _mm_mul_ps(result, vScalar); // Scale the results
+
+ _mm_store_ps(outputFloatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+ *magnitudeVectorPtr++ = (int16_t)(val1Result);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ const float scalar = 32768.0;
+ for(number = 0; number < num_points; number++){
+ float real = ((float)(*complexVectorPtr++)) / scalar;
+ float imag = ((float)(*complexVectorPtr++)) / scalar;
+ *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
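+
+/* Illustrative usage sketch (assumed buffer length). The kernel normalizes by
+   32768, takes the magnitude, and rescales, so the result stays an int16_t.
+
+     lv_16sc_t in[256] __attribute__((aligned(16)));
+     int16_t mag[256] __attribute__((aligned(16)));
+     volk_16sc_magnitude_16s_aligned16_generic(mag, in, 256);
+*/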
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
new file mode 100644
index 000000000..29c58ceb8
--- /dev/null
+++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
@@ -0,0 +1,163 @@
+#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param scalar The value each input data point is divided by
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16sc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ float inputFloatBuffer[8] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ _mm_store_ps(magnitudeVectorPtr, result);
+
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ float val1Real = (float)(*complexVectorPtr++) / scalar;
+ float val1Imag = (float)(*complexVectorPtr++) / scalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param scalar The value each input data point is divided by
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16sc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ float inputFloatBuffer[8] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ _mm_store_ps(magnitudeVectorPtr, result);
+
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ float val1Real = (float)(*complexVectorPtr++) * iScalar;
+ float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param scalar The value each input data point is divided by
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ const float invScalar = 1.0 / scalar;
+ for(number = 0; number < num_points; number++){
+ float real = ( (float) (*complexVectorPtr++)) * invScalar;
+ float imag = ( (float) (*complexVectorPtr++)) * invScalar;
+ *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
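+
+/* Illustrative usage sketch (assumed buffer length and scalar). With a scalar
+   of 32768.0f the magnitudes come out normalized to roughly [0, sqrt(2)).
+
+     lv_16sc_t in[256] __attribute__((aligned(16)));
+     float mag[256] __attribute__((aligned(16)));
+     volk_16sc_magnitude_32f_aligned16_generic(mag, in, 32768.0f, 256);
+*/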
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h
new file mode 100644
index 000000000..698e958e4
--- /dev/null
+++ b/volk/include/volk/volk_16u_byteswap_aligned16.h
@@ -0,0 +1,65 @@
+#ifndef INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H
+#define INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+ \brief Byteswaps (in-place) an aligned vector of int16_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param num_points The number of data points
+*/
+static inline void volk_16u_byteswap_aligned16_sse2(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int number = 0;
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8;
+ for(;number < eighthPoints; number++){
+ // Load the 16-bit values; increment inputPtr later since we're doing this in-place.
+ input = _mm_load_si128((__m128i*)inputPtr);
+ // Do the two shifts
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together
+ output = _mm_or_si128(left, right);
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+
+ // Byteswap any remaining points:
+ number = eighthPoints*8;
+ for(; number < num_points; number++){
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Byteswaps (in-place) an aligned vector of int16_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param num_points The number of data points
+*/
+static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int point;
+ uint16_t* inputPtr = intsToSwap;
+ for(point = 0; point < num_points; point++){
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
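+
+/* Illustrative usage sketch (assumed buffer length): swaps bytes in place,
+   e.g. to convert big-endian 16-bit samples to host order.
+
+     uint16_t samples[1024] __attribute__((aligned(16)));
+     volk_16u_byteswap_aligned16_generic(samples, 1024);
+*/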
+
+
+#endif /* INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_accumulator_aligned16.h b/volk/include/volk/volk_32f_accumulator_aligned16.h
new file mode 100644
index 000000000..7e395cf50
--- /dev/null
+++ b/volk/include/volk/volk_32f_accumulator_aligned16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H
+#define INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Accumulates the values in the input buffer
+ \param result The accumulated result
+ \param inputBuffer The buffer of data to be accumulated
+ \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_32f_accumulator_aligned16_sse(float* result, const float* inputBuffer, unsigned int num_points){
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+ float tempBuffer[4] __attribute__((aligned(128)));
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ accumulator = _mm_add_ps(accumulator, aVal);
+ aPtr += 4;
+ }
+ _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
+ returnValue = tempBuffer[0];
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Accumulates the values in the input buffer
+ \param result The accumulated result
+ \param inputBuffer The buffer of data to be accumulated
+ \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_32f_accumulator_aligned16_generic(float* result, const float* inputBuffer, unsigned int num_points){
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+ float returnValue = 0;
+
+ for(;number < num_points; number++){
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_GENERIC */
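+
+/* Illustrative usage sketch (assumed buffer length): sums all values in the
+   buffer into a single float result.
+
+     float buffer[4096] __attribute__((aligned(16)));
+     float sum;
+     volk_32f_accumulator_aligned16_generic(&sum, buffer, 4096);
+*/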
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_add_aligned16.h b/volk/include/volk/volk_32f_add_aligned16.h
new file mode 100644
index 000000000..721c60fd6
--- /dev/null
+++ b/volk/include/volk/volk_32f_add_aligned16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_VOLK_32f_ADD_ALIGNED16_H
+#define INCLUDED_VOLK_32f_ADD_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Adds the two input vectors and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_add_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_add_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Adds the two input vectors and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_add_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
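+
+/* Illustrative usage sketch (assumed buffer lengths): computes c[i] = a[i] + b[i].
+
+     float a[1024] __attribute__((aligned(16)));
+     float b[1024] __attribute__((aligned(16)));
+     float c[1024] __attribute__((aligned(16)));
+     volk_32f_add_aligned16_generic(c, a, b, 1024);
+*/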
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_ADD_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h
new file mode 100644
index 000000000..ff917525f
--- /dev/null
+++ b/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h
@@ -0,0 +1,167 @@
+#ifndef INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
+#define INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Calculates the spectral noise floor of an input power spectrum
+
+ Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor.
+
+ \param realDataPoints The input power spectrum
+ \param num_points The number of data points in the input power spectrum vector
+ \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+ \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+*/
+static inline void volk_32f_calc_spectral_noise_floor_aligned16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* dataPointsPtr = realDataPoints;
+ float avgPointsVector[4] __attribute__((aligned(128)));
+
+ __m128 dataPointsVal;
+ __m128 avgPointsVal = _mm_setzero_ps();
+ // Calculate the sum (for mean) for all points
+ for(; number < quarterPoints; number++){
+
+ dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 4;
+
+ avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
+ }
+
+ _mm_store_ps(avgPointsVector, avgPointsVal);
+
+ float sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+ // that are significantly higher (and would, thus, affect the mean more)
+ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+ __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
+ __m128 vOnesVector = _mm_set_ps1(1.0);
+ __m128 vValidBinCount = _mm_setzero_ps();
+ avgPointsVal = _mm_setzero_ps();
+ __m128 compareMask;
+ number = 0;
+ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+ for(; number < quarterPoints; number++){
+
+ dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 4;
+
+ // Identify which items do not exceed the mean amplitude
+ compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
+
+ // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
+ avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
+
+ // Count the number of bins which do not exceed the mean amplitude
+ vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
+ }
+
+ // Calculate the mean from the remaining data points
+ _mm_store_ps(avgPointsVector, avgPointsVal);
+
+ sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+
+ // Calculate the number of valid bins from the remaining count
+ float validBinCountVector[4] __attribute__((aligned(128)));
+ _mm_store_ps(validBinCountVector, vValidBinCount);
+
+ float validBinCount = 0;
+ validBinCount += validBinCountVector[0];
+ validBinCount += validBinCountVector[1];
+ validBinCount += validBinCountVector[2];
+ validBinCount += validBinCountVector[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ if(realDataPoints[number] <= meanAmplitude){
+ sumMean += realDataPoints[number];
+ validBinCount += 1.0;
+ }
+ }
+
+ float localNoiseFloorAmplitude = 0;
+ if(validBinCount > 0.0){
+ localNoiseFloorAmplitude = sumMean / validBinCount;
+ }
+ else{
+ localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
+ }
+
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Calculates the spectral noise floor of an input power spectrum
+
+ Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor.
+
+ \param realDataPoints The input power spectrum
+ \param num_points The number of data points in the input power spectrum vector
+ \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+ \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+*/
+static inline void volk_32f_calc_spectral_noise_floor_aligned16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+ float sumMean = 0.0;
+ unsigned int number;
+ // find the sum (for mean), etc
+ for(number = 0; number < num_points; number++){
+ // sum (for mean)
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+ // that are significantly higher (and would, thus, affect the mean more)
+ const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
+
+ // now throw out any bins higher than the mean
+ sumMean = 0.0;
+ unsigned int newNumDataPoints = num_points;
+ for(number = 0; number < num_points; number++){
+ if (realDataPoints[number] <= meanAmplitude)
+ sumMean += realDataPoints[number];
+ else
+ newNumDataPoints--;
+ }
+
+ float localNoiseFloorAmplitude = 0.0;
+ if (newNumDataPoints == 0) // in the odd case that all
+ localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
+ else
+ localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
+
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_GENERIC */
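+
+/* Illustrative usage sketch (the FFT size and 20 dB exclusion value are
+   assumptions): estimates the noise floor of a power spectrum given in dB.
+
+     float psd_db[2048] __attribute__((aligned(16)));
+     float noise_floor_db;
+     volk_32f_calc_spectral_noise_floor_aligned16_generic(&noise_floor_db, psd_db, 20.0f, 2048);
+*/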
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_16s_aligned16.h b/volk/include/volk/volk_32f_convert_16s_aligned16.h
new file mode 100644
index 000000000..7fbabd9c3
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_16s_aligned16.h
@@ -0,0 +1,110 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_convert_16s_aligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+
+ for(;number < eighthPoints; number++){
+ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+ intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_convert_16s_aligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+
+ float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_mul_ps(ret, vScalar);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_convert_16s_aligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ int16_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
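+
+/* Illustrative usage sketch (assumed buffer length and scalar): scaling by
+   32767.0f maps floats in [-1.0, 1.0] onto the full int16_t range.
+
+     float in[1024] __attribute__((aligned(16)));
+     int16_t out[1024] __attribute__((aligned(16)));
+     volk_32f_convert_16s_aligned16_generic(out, in, 32767.0f, 1024);
+*/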
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_16s_unaligned16.h b/volk/include/volk/volk_32f_convert_16s_unaligned16.h
new file mode 100644
index 000000000..d2bbdf13a
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_16s_unaligned16.h
@@ -0,0 +1,113 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_convert_16s_unaligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+
+ for(;number < eighthPoints; number++){
+ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+ intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] = (int16_t)(inputVector[number] * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_convert_16s_unaligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+
+ float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_mul_ps(ret, vScalar);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (int16_t)(inputVector[number] * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_convert_16s_unaligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ int16_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_32s_aligned16.h b/volk/include/volk/volk_32f_convert_32s_aligned16.h
new file mode 100644
index 000000000..011ef5d0e
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_32s_aligned16.h
@@ -0,0 +1,106 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_convert_32s_aligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1;
+ __m128i intInputVal1;
+
+ for(;number < quarterPoints; number++){
+ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (int32_t)(inputVector[number] * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_convert_32s_aligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+
+ float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_mul_ps(ret, vScalar);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (int32_t)(inputVector[number] * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_convert_32s_aligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ int32_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_32s_unaligned16.h b/volk/include/volk/volk_32f_convert_32s_unaligned16.h
new file mode 100644
index 000000000..a6df826c7
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_32s_unaligned16.h
@@ -0,0 +1,109 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_convert_32s_unaligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1;
+ __m128i intInputVal1;
+
+ for(;number < quarterPoints; number++){
+ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (int32_t)(inputVector[number] * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_convert_32s_unaligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+
+ float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_mul_ps(ret, vScalar);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (int32_t)(inputVector[number] * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_convert_32s_unaligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ int32_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_aligned16.h b/volk/include/volk/volk_32f_convert_64f_aligned16.h
new file mode 100644
index 000000000..91a855813
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_64f_aligned16.h
@@ -0,0 +1,70 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Converts the float values into double values
+ \param outputVector The converted double vector values
+ \param inputVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_32f_convert_64f_aligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+
+ inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the float values into double values
+ \param outputVector The converted double vector values
+ \param inputVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_aligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+ double* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
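+
+/* Illustrative usage sketch (assumed buffer length): widens each float to a
+   double without any scaling.
+
+     float in[1024] __attribute__((aligned(16)));
+     double out[1024] __attribute__((aligned(16)));
+     volk_32f_convert_64f_aligned16_generic(out, in, 1024);
+*/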
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_unaligned16.h b/volk/include/volk/volk_32f_convert_64f_unaligned16.h
new file mode 100644
index 000000000..698e0d446
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_64f_unaligned16.h
@@ -0,0 +1,70 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Converts the float values into double values
+ \param outputVector The converted double vector values
+ \param inputVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_32f_convert_64f_unaligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+
+ inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the float values into double values
+ \param outputVector The converted double vector values
+ \param inputVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_unaligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+ double* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_8s_aligned16.h b/volk/include/volk/volk_32f_convert_8s_aligned16.h
new file mode 100644
index 000000000..b9487b622
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_8s_aligned16.h
@@ -0,0 +1,117 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_convert_8s_aligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+
+ for(;number < sixteenthPoints; number++){
+ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+ intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+ intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
+ intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ outputVector[number] = (int8_t)(inputVector[number] * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_convert_8s_aligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+
+ float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_mul_ps(ret, vScalar);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (int8_t)(inputVector[number] * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_convert_8s_aligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ int8_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
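+
+/* Illustrative usage sketch (assumed buffer length and scalar): scaling by
+   127.0f maps floats in [-1.0, 1.0] onto the int8_t range.
+
+     float in[1024] __attribute__((aligned(16)));
+     int8_t out[1024] __attribute__((aligned(16)));
+     volk_32f_convert_8s_aligned16_generic(out, in, 127.0f, 1024);
+*/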
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_8s_unaligned16.h b/volk/include/volk/volk_32f_convert_8s_unaligned16.h
new file mode 100644
index 000000000..e986dbc87
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_8s_unaligned16.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_convert_8s_unaligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+
+ for(;number < sixteenthPoints; number++){
+ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+ intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+ intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
+ intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ outputVector[number] = (int8_t)(inputVector[number] * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_convert_8s_unaligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+
+ float outputFloatBuffer[4] __attribute__((aligned(128)));
+
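+    // Note: this SSE path only vectorizes the scaling; the float -> int8 conversion is done
+    // one element at a time from the aligned staging buffer above.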
+ for(;number < quarterPoints; number++){
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_mul_ps(ret, vScalar);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (int8_t)(inputVector[number] * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+  \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_convert_8s_unaligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ int8_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_divide_aligned16.h b/volk/include/volk/volk_32f_divide_aligned16.h
new file mode 100644
index 000000000..c00700cd8
--- /dev/null
+++ b/volk/include/volk/volk_32f_divide_aligned16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H
+#define INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Divides the two input vectors and stores the results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The dividend vector
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divided and stored into cVector
+*/
+static inline void volk_32f_divide_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_div_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Divides the two input vectors and stores the results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The dividend vector
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divided and stored into cVector
+*/
+static inline void volk_32f_divide_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_dot_prod_aligned16.h b/volk/include/volk/volk_32f_dot_prod_aligned16.h
new file mode 100644
index 000000000..3aee1136a
--- /dev/null
+++ b/volk/include/volk/volk_32f_dot_prod_aligned16.h
@@ -0,0 +1,184 @@
+#ifndef INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H
+#define INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H
+
+#include <stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32f_dot_prod_aligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE
+
+
+static inline void volk_32f_dot_prod_aligned16_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal, bVal, cVal;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ dotProdVal = _mm_add_ps(cVal, dotProdVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ float dotProductVector[4] __attribute__((aligned(16)));
+
+ _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_dot_prod_aligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal, bVal, cVal;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
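+    // Each _mm_hadd_ps folds the running accumulator into its low two lanes and appends the
+    // pairwise sums of the new products in the high lanes; after the loop, one more hadd and
+    // the sum of the first two lanes below recover the complete dot product.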
+ dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ float dotProductVector[4] __attribute__((aligned(16)));
+ dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+
+ _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_dot_prod_aligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+
+ bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+
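+    // Each _mm_dp_ps computes a full 4-element dot product (high nibble 0xF selects all inputs)
+    // while the low nibble (0x1, 0x2, 0x4, 0x8) routes the scalar result to a different lane,
+    // so the ORs below merge four partial dot products into a single accumulator vector.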
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
+
+ float dotProductVector[4] __attribute__((aligned(16)));
+ _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32f_dot_prod_unaligned16.h b/volk/include/volk/volk_32f_dot_prod_unaligned16.h
new file mode 100644
index 000000000..bce6aa15f
--- /dev/null
+++ b/volk/include/volk/volk_32f_dot_prod_unaligned16.h
@@ -0,0 +1,184 @@
+#ifndef INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H
+#define INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H
+
+#include <stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32f_dot_prod_unaligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE
+
+
+static inline void volk_32f_dot_prod_unaligned16_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal, bVal, cVal;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_loadu_ps(aPtr);
+ bVal = _mm_loadu_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ dotProdVal = _mm_add_ps(cVal, dotProdVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ float dotProductVector[4] __attribute__((aligned(16)));
+
+ _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_dot_prod_unaligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal, bVal, cVal;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_loadu_ps(aPtr);
+ bVal = _mm_loadu_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ float dotProductVector[4] __attribute__((aligned(16)));
+ dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+
+ _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_dot_prod_unaligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+
+ bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
+
+ float dotProductVector[4] __attribute__((aligned(16)));
+ _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H*/
diff --git a/volk/include/volk/volk_32f_fm_detect_aligned16.h b/volk/include/volk/volk_32f_fm_detect_aligned16.h
new file mode 100644
index 000000000..c82239d74
--- /dev/null
+++ b/volk/include/volk/volk_32f_fm_detect_aligned16.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H
+#define INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Performs the FM-detect differentiation on the input vector and stores the results in the output vector.
+ \param outputVector The byte-aligned vector where the results will be stored.
+ \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] )
+ \param bound The interval that the input phase data is in, which is used to modulo the differentiation
+ \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample.
+  \param num_points The number of real values in the input vector.
+*/
+static inline void volk_32f_fm_detect_aligned16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 1;
+ unsigned int j = 0;
+ // num_points-1 keeps Fedora 7's gcc from crashing...
+ // num_points won't work. :(
+ const unsigned int quarterPoints = (num_points-1) / 4;
+
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+ __m128 upperBound = _mm_set_ps1(bound);
+ __m128 lowerBound = _mm_set_ps1(-bound);
+ __m128 next3old1;
+ __m128 next4;
+ __m128 boundAdjust;
+ __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above.
+ __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below.
+ // Do the first 4 by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+ for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) {
+ *outPtr = *(inPtr) - *(inPtr-1);
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+ }
+
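+  // Vectorized difference: next3old1 holds in[i-1..i+2] via an unaligned load one float back,
+  // next4 holds in[i..i+3], so the subtraction yields four consecutive phase differences,
+  // which are then wrapped back into (-bound, bound] by the compare/and/or mask logic below.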
+ for (; number < quarterPoints; number++) {
+ // Load data
+ next3old1 = _mm_loadu_ps((float*) (inPtr-1));
+ next4 = _mm_load_ps(inPtr);
+ inPtr += 4;
+ // Subtract and store:
+ next3old1 = _mm_sub_ps(next4, next3old1);
+ // Bound:
+ boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
+ boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
+ next4 = _mm_cmplt_ps(next3old1, lowerBound);
+ next4 = _mm_and_ps(next4, negBoundAdjust);
+ boundAdjust = _mm_or_ps(next4, boundAdjust);
+ // Make sure we're in the bounding interval:
+ next3old1 = _mm_add_ps(next3old1, boundAdjust);
+ _mm_store_ps(outPtr,next3old1); // Store the results back into the output
+ outPtr += 4;
+ }
+
+ for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) {
+ *outPtr = *(inPtr) - *(inPtr-1);
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Performs the FM-detect differentiation on the input vector and stores the results in the output vector.
+ \param outputVector The byte-aligned vector where the results will be stored.
+ \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] )
+ \param bound The interval that the input phase data is in, which is used to modulo the differentiation
+ \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample.
+ \param num_points The number of real values in the input vector.
+*/
+static inline void volk_32f_fm_detect_aligned16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 0;
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+
+ // Do the first 1 by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+
+ for (number = 1; number < num_points; number++) {
+ *outPtr = *(inPtr) - *(inPtr-1);
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_index_max_aligned16.h b/volk/include/volk/volk_32f_index_max_aligned16.h
new file mode 100644
index 000000000..26322bfa2
--- /dev/null
+++ b/volk/include/volk/volk_32f_index_max_aligned16.h
@@ -0,0 +1,148 @@
+#ifndef INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32f_index_max_aligned16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) {
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
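+    // _mm_set_ps stores its arguments from high lane to low, so lanes 0..3 start at -4,-3,-2,-1;
+    // after the first +4 increment, lane k carries the index of element k of the loaded block.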
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ float maxValuesBuffer[4] __attribute__((aligned(16)));
+ float maxIndexesBuffer[4] __attribute__((aligned(16)));
+
+ for(;number < quarterPoints; number++){
+
+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
+ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
+ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++){
+ if(maxValuesBuffer[number] > max){
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ if(src0[number] > max){
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (unsigned int)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_index_max_aligned16_sse(unsigned int* target, const float* src0, unsigned int num_points) {
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ float maxValuesBuffer[4] __attribute__((aligned(16)));
+ float maxIndexesBuffer[4] __attribute__((aligned(16)));
+
+ for(;number < quarterPoints; number++){
+
+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
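+      // SSE has no blendv, so emulate a per-lane select: keep the old max/index where the
+      // compare mask is set, take the new value/index where it is clear.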
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
+
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++){
+ if(maxValuesBuffer[number] > max){
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ if(src0[number] > max){
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (unsigned int)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32f_index_max_aligned16_generic(unsigned int* target, const float* src0, unsigned int num_points) {
+ if(num_points > 0){
+ float max = src0[0];
+ unsigned int index = 0;
+
+    unsigned int i = 1;
+
+ for(; i < num_points; ++i) {
+
+ if(src0[i] > max){
+ index = i;
+ max = src0[i];
+ }
+
+ }
+ target[0] = index;
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32f_interleave_16sc_aligned16.h b/volk/include/volk/volk_32f_interleave_16sc_aligned16.h
new file mode 100644
index 000000000..476946b88
--- /dev/null
+++ b/volk/include/volk/volk_32f_interleave_16sc_aligned16.h
@@ -0,0 +1,155 @@
+#ifndef INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H
+#define INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H
+
+#include <inttypes.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param scalar The scaling value being multiplied against each data point
+ \param num_points The number of complex data values to be interleaved
+ */
+static inline void volk_32f_interleave_16sc_aligned16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue1, cplxValue2;
+ __m128i intValue1, intValue2;
+
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ for(;number < quarterPoints; number++){
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+ cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+ cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+
+ intValue1 = _mm_cvtps_epi32(cplxValue1);
+ intValue2 = _mm_cvtps_epi32(cplxValue2);
+
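+      // _mm_packs_epi32 narrows the eight scaled 32-bit results to int16 with signed
+      // saturation, producing four interleaved I/Q pairs per store.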
+ intValue1 = _mm_packs_epi32(intValue1, intValue2);
+
+ _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
+ complexVectorPtr += 8;
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for(; number < num_points; number++){
+ *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+ }
+
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param scalar The scaling value being multiplied against each data point
+ \param num_points The number of complex data values to be interleaved
+ */
+static inline void volk_32f_interleave_16sc_aligned16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue;
+
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ float floatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue = _mm_unpacklo_ps(iValue, qValue);
+ cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+ _mm_store_ps(floatBuffer, cplxValue);
+
+ *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue = _mm_unpackhi_ps(iValue, qValue);
+ cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+ _mm_store_ps(floatBuffer, cplxValue);
+
+ *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for(; number < num_points; number++){
+ *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+ }
+
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+ /*!
+ \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param scalar The scaling value being multiplied against each data point
+ \param num_points The number of complex data values to be interleaved
+ */
+static inline void volk_32f_interleave_16sc_aligned16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_interleave_32fc_aligned16.h b/volk/include/volk/volk_32f_interleave_32fc_aligned16.h
new file mode 100644
index 000000000..859c6a0ef
--- /dev/null
+++ b/volk/include/volk/volk_32f_interleave_32fc_aligned16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H
+#define INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H
+
+#include <inttypes.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Interleaves the I & Q vector data into the complex vector
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_interleave_32fc_aligned16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+ unsigned int number = 0;
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ const uint64_t quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue;
+ for(;number < quarterPoints; number++){
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue = _mm_unpacklo_ps(iValue, qValue);
+ _mm_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 4;
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue = _mm_unpackhi_ps(iValue, qValue);
+ _mm_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 4;
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Interleaves the I & Q vector data into the complex vector.
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_interleave_32fc_aligned16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+ unsigned int number;
+
+ for(number = 0; number < num_points; number++){
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_max_aligned16.h b/volk/include/volk/volk_32f_max_aligned16.h
new file mode 100644
index 000000000..96aafb2bf
--- /dev/null
+++ b/volk/include/volk/volk_32f_max_aligned16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_VOLK_32f_MAX_ALIGNED16_H
+#define INCLUDED_VOLK_32f_MAX_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Selects the maximum value from each entry of aVector and bVector and stores the results in cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_max_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_max_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects the maximum value from each entry of aVector and bVector and stores the results in cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_max_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_VOLK_32f_MAX_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_min_aligned16.h b/volk/include/volk/volk_32f_min_aligned16.h
new file mode 100644
index 000000000..e247f4213
--- /dev/null
+++ b/volk/include/volk/volk_32f_min_aligned16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_VOLK_32f_MIN_ALIGNED16_H
+#define INCLUDED_VOLK_32f_MIN_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Selects the minimum value from each entry of aVector and bVector and stores the results in cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_min_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_min_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects the minimum value from each entry of aVector and bVector and stores the results in cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_min_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_VOLK_32f_MIN_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_multiply_aligned16.h b/volk/include/volk/volk_32f_multiply_aligned16.h
new file mode 100644
index 000000000..b557580ab
--- /dev/null
+++ b/volk/include/volk/volk_32f_multiply_aligned16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H
+#define INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplies the two input vectors and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_multiply_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Multiplies the two input vectors and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_multiply_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h
new file mode 100644
index 000000000..1aabb1d9d
--- /dev/null
+++ b/volk/include/volk/volk_32f_normalize_aligned16.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H
+#define INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value )
+  \param vecBuffer The buffer of values to be normalized (modified in place)
+ \param num_points The number of values in vecBuffer
+ \param scalar The scale value to be applied to each buffer value
+*/
+static inline void volk_32f_normalize_aligned16_sse(float* vecBuffer, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
+
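+  // Precompute the reciprocal once so the loop multiplies instead of divides
+  // (assumes scalar is non-zero).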
+ const float invScalar = 1.0 / scalar;
+ __m128 vecScalar = _mm_set_ps1(invScalar);
+
+ __m128 input1;
+
+ const uint64_t quarterPoints = num_points / 4;
+ for(;number < quarterPoints; number++){
+
+ input1 = _mm_load_ps(inputPtr);
+
+ input1 = _mm_mul_ps(input1, vecScalar);
+
+ _mm_store_ps(inputPtr, input1);
+
+ inputPtr += 4;
+ }
+
+ number = quarterPoints*4;
+ for(; number < num_points; number++){
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value )
+  \param vecBuffer The buffer of values to be normalized (modified in place)
+  \param scalar The scale value each buffer value is divided by
+  \param num_points The number of values in vecBuffer
+*/
+static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
+ const float invScalar = 1.0 / scalar;
+ for(number = 0; number < num_points; number++){
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_power_aligned16.h b/volk/include/volk/volk_32f_power_aligned16.h
new file mode 100644
index 000000000..2ecd8eecb
--- /dev/null
+++ b/volk/include/volk/volk_32f_power_aligned16.h
@@ -0,0 +1,144 @@
+#ifndef INCLUDED_VOLK_32f_POWER_ALIGNED16_H
+#define INCLUDED_VOLK_32f_POWER_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Raises each value in the input vector to the specified power and stores the results in the return vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of values to be raised to a power
+  \param power The power value to be applied to each data point
+  \param num_points The number of values in aVector to be raised to the specified power and stored into cVector
+*/
+static inline void volk_32f_power_aligned16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+#if LV_HAVE_LIB_SIMDMATH
+ __m128 vPower = _mm_set_ps1(power);
+ __m128 zeroValue = _mm_setzero_ps();
+ __m128 signMask;
+ __m128 negatedValues;
+ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+ __m128 onesMask = _mm_set_ps1(1);
+
+ __m128 aVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ signMask = _mm_cmplt_ps(aVal, zeroValue);
+ negatedValues = _mm_sub_ps(zeroValue, aVal);
+ aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
+
+ // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
+ cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+ cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for(;number < num_points; number++){
+ *cPtr++ = powf((*aPtr++), power);
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Raises each value in the input vector to the specified power and stores the results in the return vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of values to be raised to a power
+  \param power The power value to be applied to each data point
+  \param num_points The number of values in aVector to be raised to the specified power and stored into cVector
+*/
+static inline void volk_32f_power_aligned16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+#if LV_HAVE_LIB_SIMDMATH
+ __m128 vPower = _mm_set_ps1(power);
+ __m128 zeroValue = _mm_setzero_ps();
+ __m128 signMask;
+ __m128 negatedValues;
+ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+ __m128 onesMask = _mm_set_ps1(1);
+
+ __m128 aVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ signMask = _mm_cmplt_ps(aVal, zeroValue);
+ negatedValues = _mm_sub_ps(zeroValue, aVal);
+ aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) );
+
+ // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
+ cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+ cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for(;number < num_points; number++){
+ *cPtr++ = powf((*aPtr++), power);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+ /*!
+    \brief Raises each value in the input vector to the specified power and stores the results in the return vector
+    \param cVector The vector where the results will be stored
+    \param aVector The vector of values to be raised to a power
+    \param power The power value to be applied to each data point
+    \param num_points The number of values in aVector to be raised to the specified power and stored into cVector
+ */
+static inline void volk_32f_power_aligned16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = powf((*aPtr++), power);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_POWER_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_sqrt_aligned16.h b/volk/include/volk/volk_32f_sqrt_aligned16.h
new file mode 100644
index 000000000..0b2eaf251
--- /dev/null
+++ b/volk/include/volk/volk_32f_sqrt_aligned16.h
@@ -0,0 +1,64 @@
+#ifndef INCLUDED_VOLK_32f_SQRT_ALIGNED16_H
+#define INCLUDED_VOLK_32f_SQRT_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes the square root of each value in aVector and stores the results in cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The input vector of values
+  \param num_points The number of values in aVector to be square rooted and stored into cVector
+*/
+static inline void volk_32f_sqrt_aligned16_sse(float* cVector, const float* aVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+
+ cVal = _mm_sqrt_ps(aVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = sqrtf(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Computes the square root of each value in aVector and stores the results in cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The input vector of values
+  \param num_points The number of values in aVector to be square rooted and stored into cVector
+*/
+static inline void volk_32f_sqrt_aligned16_generic(float* cVector, const float* aVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = sqrtf(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_SQRT_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_stddev_aligned16.h b/volk/include/volk/volk_32f_stddev_aligned16.h
new file mode 100644
index 000000000..1c6a08437
--- /dev/null
+++ b/volk/include/volk/volk_32f_stddev_aligned16.h
@@ -0,0 +1,144 @@
+#ifndef INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H
+#define INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Calculates the standard deviation of the input buffer using the supplied mean
+ \param stddev The calculated standard deviation
+ \param inputBuffer The buffer of points to calculate the std deviation for
+ \param mean The mean of the input buffer
+  \param num_points The number of values in the input buffer to be used in the stddev calculation
+*/
+static inline void volk_32f_stddev_aligned16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+ float returnValue = 0;
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* aPtr = inputBuffer;
+
+ float squareBuffer[4] __attribute__((aligned(128)));
+
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal1, aVal2, aVal3, aVal4;
+ __m128 cVal1, cVal2, cVal3, cVal4;
+ for(;number < sixteenthPoints; number++) {
+ aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+
+ aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+
+ aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+
+ aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
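+    // One-pass form: stddev = sqrt( E[x^2] - mean^2 ), using the mean supplied by the caller.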
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrt(returnValue);
+ }
+ *stddev = returnValue;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Calculates the standard deviation of the input buffer using the supplied mean
+ \param stddev The calculated standard deviation
+ \param inputBuffer The buffer of points to calculate the std deviation for
+ \param mean The mean of the input buffer
+  \param num_points The number of values in the input buffer to be used in the stddev calculation
+*/
+static inline void volk_32f_stddev_aligned16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+ float returnValue = 0;
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+
+ float squareBuffer[4] __attribute__((aligned(128)));
+
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+ for(;number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr); // aVal = x
+ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
+ squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+ aPtr += 4;
+ }
+ _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrt(returnValue);
+ }
+ *stddev = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Calculates the standard deviation of the input buffer using the supplied mean
+ \param stddev The calculated standard deviation
+ \param inputBuffer The buffer of points to calculate the std deviation for
+ \param mean The mean of the input buffer
+  \param num_points The number of values in the input buffer to be used in the stddev calculation
+*/
+static inline void volk_32f_stddev_aligned16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+ float returnValue = 0;
+ if(num_points > 0){
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrt(returnValue);
+ }
+ *stddev = returnValue;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h
new file mode 100644
index 000000000..1cd502257
--- /dev/null
+++ b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h
@@ -0,0 +1,169 @@
+#ifndef INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H
+#define INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Calculates the standard deviation and mean of the input buffer
+ \param stddev The calculated standard deviation
+ \param mean The mean of the input buffer
+ \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in the input buffer to be used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_aligned16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+ float returnValue = 0;
+ float newMean = 0;
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* aPtr = inputBuffer;
+ float meanBuffer[4] __attribute__((aligned(128)));
+ float squareBuffer[4] __attribute__((aligned(128)));
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal1, aVal2, aVal3, aVal4;
+ __m128 cVal1, cVal2, cVal3, cVal4;
+ for(;number < sixteenthPoints; number++) {
+ aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+ accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x
+
+ aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+ accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x
+
+ aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+ accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x
+
+ aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+ accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+ _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ newMean = meanBuffer[0];
+ newMean += meanBuffer[1];
+ newMean += meanBuffer[2];
+ newMean += meanBuffer[3];
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ newMean += *aPtr++;
+ }
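+    // Single pass: mean = sum/N and stddev = sqrt( E[x^2] - mean^2 ) from the same traversal.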
+ newMean /= num_points;
+ returnValue /= num_points;
+ returnValue -= (newMean * newMean);
+ returnValue = sqrt(returnValue);
+ }
+ *stddev = returnValue;
+ *mean = newMean;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Calculates the standard deviation and mean of the input buffer
+ \param stddev The calculated standard deviation
+ \param mean The mean of the input buffer
+ \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in the input buffer to be used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_aligned16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+ float returnValue = 0;
+ float newMean = 0;
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+ float meanBuffer[4] __attribute__((aligned(128)));
+ float squareBuffer[4] __attribute__((aligned(128)));
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+ for(;number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr); // aVal = x
+ accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x
+ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
+ squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+ aPtr += 4;
+ }
+ _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+ _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ newMean = meanBuffer[0];
+ newMean += meanBuffer[1];
+ newMean += meanBuffer[2];
+ newMean += meanBuffer[3];
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ newMean += *aPtr++;
+ }
+ newMean /= num_points;
+ returnValue /= num_points;
+ returnValue -= (newMean * newMean);
+ returnValue = sqrt(returnValue);
+ }
+ *stddev = returnValue;
+ *mean = newMean;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Calculates the standard deviation and mean of the input buffer
+ \param stddev The calculated standard deviation
+ \param mean The mean of the input buffer
+ \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in the input buffer to be used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_aligned16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+ float returnValue = 0;
+ float newMean = 0;
+ if(num_points > 0){
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ newMean += *aPtr++;
+ }
+ newMean /= num_points;
+ returnValue /= num_points;
+ returnValue -= (newMean * newMean);
+ returnValue = sqrt(returnValue);
+ }
+ *stddev = returnValue;
+ *mean = newMean;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H */
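
A minimal usage sketch for the generic kernel above (not part of the library sources); it assumes volk/include is on the include path and that the program links against libm for sqrt():

    #define LV_HAVE_GENERIC 1
    #include <math.h>
    #include <stdio.h>
    #include <volk/volk_32f_stddev_and_mean_aligned16.h>

    int main(void){
      float buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      float stddev = 0.0f;
      float mean = 0.0f;
      volk_32f_stddev_and_mean_aligned16_generic(&stddev, &mean, buf, 8);
      printf("mean = %f, stddev = %f\n", mean, stddev); /* mean = 4.5, stddev ~= 2.29 */
      return 0;
    }
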
diff --git a/volk/include/volk/volk_32f_subtract_aligned16.h b/volk/include/volk/volk_32f_subtract_aligned16.h
new file mode 100644
index 000000000..ac3f5e5d1
--- /dev/null
+++ b/volk/include/volk/volk_32f_subtract_aligned16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H
+#define INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Subtracts bVector from aVector and stores the results in cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The initial vector
+ \param bVector The vector to be subtracted
+ \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_subtract_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_sub_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Subtracts bVector from aVector and stores the results in cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The initial vector
+ \param bVector The vector to be subtracted
+ \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_subtract_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H */
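
A quick sketch exercising the generic subtract kernel, assuming volk/include is on the include path:

    #define LV_HAVE_GENERIC 1
    #include <stdio.h>
    #include <volk/volk_32f_subtract_aligned16.h>

    int main(void){
      float a[4] = {5, 6, 7, 8};
      float b[4] = {1, 2, 3, 4};
      float c[4];
      volk_32f_subtract_aligned16_generic(c, a, b, 4);
      printf("%f %f %f %f\n", c[0], c[1], c[2], c[3]); /* 4 4 4 4 */
      return 0;
    }
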
diff --git a/volk/include/volk/volk_32f_sum_of_poly_aligned16.h b/volk/include/volk/volk_32f_sum_of_poly_aligned16.h
new file mode 100644
index 000000000..a326e62b1
--- /dev/null
+++ b/volk/include/volk/volk_32f_sum_of_poly_aligned16.h
@@ -0,0 +1,151 @@
+#ifndef INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H
+#define INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifndef MAX
+#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#endif
+
+#if LV_HAVE_SSE3
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+
+static inline void volk_32f_sum_of_poly_aligned16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+
+
+ float result = 0.0;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+ //float fith = 0.0;
+
+
+
+ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
+
+ xmm9 = _mm_setzero_ps();
+ xmm1 = _mm_setzero_ps();
+
+  xmm0 = _mm_load1_ps(&center_point_array[0]);
+  xmm6 = _mm_load1_ps(&center_point_array[1]);
+  xmm7 = _mm_load1_ps(&center_point_array[2]);
+  xmm8 = _mm_load1_ps(&center_point_array[3]);
+  //xmm11 = _mm_load1_ps(&center_point_array[4]);
+ xmm10 = _mm_load1_ps(cutoff);
+
+ int bound = num_bytes >> 4;
+ int leftovers = (num_bytes >> 2) & 3;
+ int i = 0;
+
+ for(; i < bound; ++i) {
+ xmm2 = _mm_load_ps(src0);
+ xmm2 = _mm_max_ps(xmm10, xmm2);
+ xmm3 = _mm_mul_ps(xmm2, xmm2);
+ xmm4 = _mm_mul_ps(xmm2, xmm3);
+ xmm5 = _mm_mul_ps(xmm3, xmm3);
+ //xmm12 = _mm_mul_ps(xmm3, xmm4);
+
+ xmm2 = _mm_mul_ps(xmm2, xmm0);
+ xmm3 = _mm_mul_ps(xmm3, xmm6);
+ xmm4 = _mm_mul_ps(xmm4, xmm7);
+ xmm5 = _mm_mul_ps(xmm5, xmm8);
+ //xmm12 = _mm_mul_ps(xmm12, xmm11);
+
+ xmm2 = _mm_add_ps(xmm2, xmm3);
+ xmm3 = _mm_add_ps(xmm4, xmm5);
+
+ src0 += 4;
+
+ xmm9 = _mm_add_ps(xmm2, xmm9);
+
+ xmm1 = _mm_add_ps(xmm3, xmm1);
+
+ //xmm9 = _mm_add_ps(xmm12, xmm9);
+ }
+
+ xmm2 = _mm_hadd_ps(xmm9, xmm1);
+ xmm3 = _mm_hadd_ps(xmm2, xmm2);
+ xmm4 = _mm_hadd_ps(xmm3, xmm3);
+
+ _mm_store_ss(&result, xmm4);
+
+
+
+ for(i = 0; i < leftovers; ++i) {
+ fst = src0[i];
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ //fith = sq * thrd;
+
+ result += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth);// +
+ //center_point_array[4] * fith);
+ }
+
+ result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
+
+ target[0] = result;
+}
+
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_32f_sum_of_poly_aligned16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+
+
+
+ float result = 0.0;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+ //float fith = 0.0;
+
+
+
+ int i = 0;
+
+ for(; i < num_bytes >> 2; ++i) {
+ fst = src0[i];
+ fst = MAX(fst, *cutoff);
+
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ //fith = sq * thrd;
+
+ result += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth); //+
+ //center_point_array[4] * fith);
+ /*printf("%f12...%d\n", (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth) +
+ //center_point_array[4] * fith) +
+ (center_point_array[4]), i);
+ */
+ }
+
+ result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
+
+
+
+ *target = result;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H*/
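
Both variants above clamp each sample to cutoff, evaluate c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4 per sample, and finally add the constant term c[4] once per processed sample; num_bytes is the size of src0 in bytes, so num_bytes/4 floats are consumed. A plain-C restatement of that computation (illustrative only; the helper name is made up):

    #include <stdio.h>

    /* Reference version of the polynomial sum computed by the kernels above. */
    static float sum_of_poly_ref(const float* src0, const float* c,
                                 float cutoff, unsigned int num_bytes){
      float result = 0.0f;
      unsigned int i;
      for(i = 0; i < (num_bytes >> 2); ++i){
        float x = (src0[i] > cutoff) ? src0[i] : cutoff;     /* clamp from below */
        result += c[0]*x + c[1]*x*x + c[2]*x*x*x + c[3]*x*x*x*x;
      }
      result += (float)(num_bytes >> 2) * c[4];              /* constant term, once per point */
      return result;
    }

    int main(void){
      float x[4] = {0.5f, 1.0f, 1.5f, 2.0f};
      float c[5] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f};           /* degenerate case: just sum the samples */
      printf("%f\n", sum_of_poly_ref(x, c, 0.0f, sizeof(x))); /* 5.0 */
      return 0;
    }
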
diff --git a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h
new file mode 100644
index 000000000..436656ca0
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h
@@ -0,0 +1,82 @@
+#ifndef INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+  \brief Multiplies the input complex vector by the input float vector and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The complex vector to be multiplied
+  \param bVector The vector containing the float values to be multiplied against each complex value in aVector
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_32f_multiply_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal1 = _mm_load_ps((const float*)aPtr);
+ aPtr += 2;
+
+ aVal2 = _mm_load_ps((const float*)aPtr);
+ aPtr += 2;
+
+ bVal = _mm_load_ps(bPtr);
+ bPtr += 4;
+
+ bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0));
+ bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2));
+
+ cVal = _mm_mul_ps(aVal1, bVal1);
+
+ _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
+ cPtr += 2;
+
+ cVal = _mm_mul_ps(aVal2, bVal2);
+
+ _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
+
+ cPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr);
+ bPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+ /*!
+  \brief Multiplies the input complex vector by the input float vector and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The complex vector to be multiplied
+  \param bVector The vector containing the float values to be multiplied against each complex value in aVector
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_32f_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H */
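
A usage sketch for the generic complex-by-float multiply, assuming volk/include is on the include path and that lv_32fc_t comes from volk/volk_complex.h:

    #define LV_HAVE_GENERIC 1
    #include <stdio.h>
    #include <volk/volk_complex.h>
    #include <volk/volk_32fc_32f_multiply_aligned16.h>

    int main(void){
      lv_32fc_t a[2], c[2];
      float gains[2] = {2.0f, 0.5f};
      float* af = (float*)a;          /* interleaved I/Q, as the kernels store it */
      float* cf = (float*)c;
      af[0] = 1.0f; af[1] = 2.0f;     /* a[0] = 1 + 2j */
      af[2] = 3.0f; af[3] = 4.0f;     /* a[1] = 3 + 4j */
      volk_32fc_32f_multiply_aligned16_generic(c, a, gains, 2);
      printf("(%f, %f) (%f, %f)\n", cf[0], cf[1], cf[2], cf[3]); /* (2, 4) (1.5, 2) */
      return 0;
    }
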
diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h b/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h
new file mode 100644
index 000000000..2d71ee4f8
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h
@@ -0,0 +1,109 @@
+#ifndef INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+ \brief Raises each value of the input complex vector to the specified power and stores the results in the return vector
+ \param cVector The vector where the results will be stored
+ \param aVector The complex vector of values to be taken to a power
+ \param power The power value to be applied to each data point
+ \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+*/
+static inline void volk_32fc_32f_power_32fc_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+
+#if LV_HAVE_LIB_SIMDMATH
+ __m128 vPower = _mm_set_ps1(power);
+
+ __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
+ for(;number < quarterPoints; number++){
+
+ cplxValue1 = _mm_load_ps((float*)aPtr);
+ aPtr += 2;
+
+ cplxValue2 = _mm_load_ps((float*)aPtr);
+ aPtr += 2;
+
+ // Convert to polar coordinates
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ phase = atan2f4(qValue, iValue); // Calculate the Phase
+
+ magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values
+
+ // Now calculate the power of the polar coordinate data
+ magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
+
+ phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
+
+ // Convert back to cartesian coordinates
+ iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude
+ qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude
+
+ cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
+ cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
+
+ _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container
+
+ cPtr += 2;
+
+ _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container
+
+ cPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ lv_32fc_t complexPower;
+ ((float*)&complexPower)[0] = power;
+ ((float*)&complexPower)[1] = 0;
+ for(;number < num_points; number++){
+ *cPtr++ = lv_cpow((*aPtr++), complexPower);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+ /*!
+  \brief Raises each value of the input complex vector to the specified power and stores the results in the return vector
+ \param cVector The vector where the results will be stored
+ \param aVector The complex vector of values to be taken to a power
+ \param power The power value to be applied to each data point
+ \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+ */
+static inline void volk_32fc_32f_power_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = 0;
+ lv_32fc_t complexPower;
+ ((float*)&complexPower)[0] = power;
+ ((float*)&complexPower)[1] = 0.0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = lv_cpow((*aPtr++), complexPower);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H */
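
A sketch of the generic power kernel, which raises each complex sample to the given real power through lv_cpow; it assumes volk/include is on the include path and that the program links against libm:

    #define LV_HAVE_GENERIC 1
    #include <stdio.h>
    #include <volk/volk_complex.h>
    #include <volk/volk_32fc_32f_power_32fc_aligned16.h>

    int main(void){
      lv_32fc_t in[1], out[1];
      float* inf = (float*)in;
      float* outf = (float*)out;
      inf[0] = 0.0f; inf[1] = 2.0f;   /* in[0] = 2j */
      volk_32fc_32f_power_32fc_aligned16_generic(out, in, 2.0f, 1);
      printf("(%f, %f)\n", outf[0], outf[1]); /* (2j)^2, approximately (-4, 0) */
      return 0;
    }
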
diff --git a/volk/include/volk/volk_32fc_atan2_32f_aligned16.h b/volk/include/volk/volk_32fc_atan2_32f_aligned16.h
new file mode 100644
index 000000000..df0ebb987
--- /dev/null
+++ b/volk/include/volk/volk_32fc_atan2_32f_aligned16.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+ \brief performs the atan2 on the input vector and stores the results in the output vector.
+ \param outputVector The byte-aligned vector where the results will be stored.
+ \param complexVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
+ \param normalizeFactor The atan2 results will be divided by this normalization factor.
+ \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_atan2_32f_aligned16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+ const float* complexVectorPtr = (float*)complexVector;
+ float* outPtr = outputVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+#if LV_HAVE_LIB_SIMDMATH
+ __m128 testVector = _mm_set_ps1(2*M_PI);
+ __m128 correctVector = _mm_set_ps1(M_PI);
+ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+ __m128 phase;
+ __m128 complex1, complex2, iValue, qValue;
+ __m128 keepMask;
+
+ for (; number < quarterPoints; number++) {
+ // Load IQ data:
+ complex1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ complex2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ // Deinterleave IQ data:
+ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
+ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
+ // Arctan to get phase:
+ phase = atan2f4(qValue, iValue);
+ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+ // Compare to 2pi:
+ keepMask = _mm_cmpneq_ps(phase,testVector);
+ phase = _mm_blendv_ps(correctVector, phase, keepMask);
+ // done with above correction.
+ phase = _mm_mul_ps(phase, vNormalizeFactor);
+ _mm_store_ps((float*)outPtr, phase);
+ outPtr += 4;
+ }
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for (; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+ \brief performs the atan2 on the input vector and stores the results in the output vector.
+ \param outputVector The byte-aligned vector where the results will be stored.
+ \param complexVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
+ \param normalizeFactor The atan2 results will be divided by this normalization factor.
+ \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_atan2_32f_aligned16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+ const float* complexVectorPtr = (float*)complexVector;
+ float* outPtr = outputVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+#if LV_HAVE_LIB_SIMDMATH
+ __m128 testVector = _mm_set_ps1(2*M_PI);
+ __m128 correctVector = _mm_set_ps1(M_PI);
+ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+ __m128 phase;
+ __m128 complex1, complex2, iValue, qValue;
+ __m128 mask;
+ __m128 keepMask;
+
+ for (; number < quarterPoints; number++) {
+ // Load IQ data:
+ complex1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ complex2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ // Deinterleave IQ data:
+ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
+ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
+ // Arctan to get phase:
+ phase = atan2f4(qValue, iValue);
+ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+ // Compare to 2pi:
+ keepMask = _mm_cmpneq_ps(phase,testVector);
+ phase = _mm_and_ps(phase, keepMask);
+ mask = _mm_andnot_ps(keepMask, correctVector);
+ phase = _mm_or_ps(phase, mask);
+ // done with above correction.
+ phase = _mm_mul_ps(phase, vNormalizeFactor);
+ _mm_store_ps((float*)outPtr, phase);
+ outPtr += 4;
+ }
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for (; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief performs the atan2 on the input vector and stores the results in the output vector.
+ \param outputVector The vector where the results will be stored.
+ \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin).
+ \param normalizeFactor The atan2 results will be divided by this normalization factor.
+ \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_atan2_32f_aligned16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
+ float* outPtr = outputVector;
+ const float* inPtr = (float*)inputVector;
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
+ unsigned int number;
+ for ( number = 0; number < num_points; number++) {
+ const float real = *inPtr++;
+ const float imag = *inPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H */
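
A sketch of the generic atan2 kernel; the raw phase is divided by normalizeFactor, so passing pi maps the output into [-1, 1). It assumes volk/include is on the include path:

    #define LV_HAVE_GENERIC 1
    #include <stdio.h>
    #include <volk/volk_complex.h>
    #include <volk/volk_32fc_atan2_32f_aligned16.h>

    int main(void){
      lv_32fc_t in[2];
      float phase[2];
      float* inf = (float*)in;
      inf[0] = 1.0f; inf[1] = 1.0f;   /* 45 degrees */
      inf[2] = 0.0f; inf[3] = 1.0f;   /* 90 degrees */
      volk_32fc_atan2_32f_aligned16_generic(phase, in, 3.14159265358979f, 2);
      printf("%f %f\n", phase[0], phase[1]); /* 0.25 0.50 */
      return 0;
    }
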
diff --git a/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h
new file mode 100644
index 000000000..60103c1b5
--- /dev/null
+++ b/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h
@@ -0,0 +1,344 @@
+#ifndef INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_conjugate_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+
+ float * res = (float*) result;
+ float * in = (float*) input;
+ float * tp = (float*) taps;
+ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+ unsigned int isodd = (num_bytes >> 3) &1;
+
+
+
+ float sum0[2] = {0,0};
+ float sum1[2] = {0,0};
+ int i = 0;
+
+
+ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+
+
+ sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+
+
+ in += 4;
+ tp += 4;
+
+ }
+
+
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+
+
+ for(i = 0; i < isodd; ++i) {
+
+
+ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+
+ }
+ /*
+ for(i = 0; i < num_bytes >> 3; ++i) {
+ *result += input[i] * conjf(taps[i]);
+ }
+ */
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_conjugate_dot_prod_aligned16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+
+ static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+
+
+
+
+ asm volatile
+ (
+ "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %[conjugator], %%r9\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movaps 0(%%r9), %%xmm8\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movups 0(%%r10), %%xmm2\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " xorps %%xmm8, %%xmm2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%r9), %%xmmA\n\t"
+ "# movaps (%%r10), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%r9), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%r10), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " xorps %%xmm8, %%xmm3\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%r9), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " movaps 32(%%r10), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ " xorps %%xmm8, %%xmm2\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+ :
+ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
+ :"rax", "r8", "r9", "r10"
+ );
+
+
+ int getem = num_bytes % 16;
+
+
+ for(; getem > 0; getem -= 8) {
+
+
+ *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
+
+ }
+
+ return;
+}
+#endif /* LV_HAVE_SSE && LV_HAVE_64 */
+
+#if LV_HAVE_SSE && LV_HAVE_32
+static inline void volk_32fc_conjugate_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+
+ static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+
+ int bound = num_bytes >> 4;
+ int leftovers = num_bytes % 16;
+
+
+ asm volatile
+ (
+ " #pushl %%ebp\n\t"
+ " #movl %%esp, %%ebp\n\t"
+ " #movl 12(%%ebp), %%eax # input\n\t"
+ " #movl 16(%%ebp), %%edx # taps\n\t"
+ " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
+ " movaps 0(%[conjugator]), %%xmm1\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%[eax]), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%[edx]), %%xmm2\n\t"
+ " movl %[ecx], (%[out])\n\t"
+ " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
+
+ " xorps %%xmm1, %%xmm2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%[eax]), %%xmmA\n\t"
+ "# movaps (%[edx]), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%[edx]), %%xmm3\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " xorps %%xmm1, %%xmm3\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " movaps 16(%[eax]), %%xmm1\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " movaps 0(%[conjugator]), %%xmm1\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%[eax]), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " addl $32, %[eax]\n\t"
+ " movaps 32(%[edx]), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " xorps %%xmm1, %%xmm2\n\t"
+ " addl $32, %[edx]\n\t"
+ ".%=L1_test:\n\t"
+ " decl %[ecx]\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
+ " shrl $4, %[ecx]\n\t"
+ " andl $1, %[ecx]\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " #movl 8(%%ebp), %[eax] \n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " movl $0x80000000, (%[out])\n\t"
+ " movss (%[out]), %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " #movl 8(%%ebp), %[eax] # @result\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t"
+ " #popl %%ebp\n\t"
+ :
+ : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
+ );
+
+
+
+
+ printf("%d, %d\n", leftovers, bound);
+
+ for(; leftovers > 0; leftovers -= 8) {
+
+
+ *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
+
+ }
+
+ return;
+
+
+
+
+
+
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_32*/
+
+
+
+#endif /*INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H*/
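
A sketch of the generic conjugate dot product; note that the last argument is a byte count, not a point count. It assumes volk/include is on the include path:

    #define LV_HAVE_GENERIC 1
    #include <stdio.h>
    #include <volk/volk_complex.h>
    #include <volk/volk_32fc_conjugate_dot_prod_aligned16.h>

    int main(void){
      lv_32fc_t x[2], t[2], acc;
      float* xf = (float*)x;
      float* tf = (float*)t;
      float* af = (float*)&acc;
      xf[0] = 1.0f; xf[1] = 2.0f;     /* x[0] = 1 + 2j */
      xf[2] = 3.0f; xf[3] = 4.0f;     /* x[1] = 3 + 4j */
      tf[0] = 1.0f; tf[1] = 0.0f;     /* t[0] = 1      */
      tf[2] = 0.0f; tf[3] = 1.0f;     /* t[1] = j      */
      volk_32fc_conjugate_dot_prod_aligned16_generic(&acc, x, t, 2 * sizeof(lv_32fc_t));
      printf("(%f, %f)\n", af[0], af[1]); /* (1+2j)*1 + (3+4j)*conj(j) = (5, -1) */
      return 0;
    }
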
diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h
new file mode 100644
index 000000000..02085cd1e
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Deinterleaves the complex vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 cplxValue1, cplxValue2, iValue, qValue;
+ for(;number < quarterPoints; number++){
+
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int number;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H */
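
A sketch of the generic deinterleaver, which splits interleaved I/Q samples into separate I and Q buffers; the 64f and real-only variants in the following files behave the same way but change the output type or drop the Q output. It assumes volk/include is on the include path:

    #define LV_HAVE_GENERIC 1
    #include <stdio.h>
    #include <volk/volk_complex.h>
    #include <volk/volk_32fc_deinterleave_32f_aligned16.h>

    int main(void){
      lv_32fc_t in[2];
      float i[2], q[2];
      float* inf = (float*)in;
      inf[0] = 1.0f; inf[1] = 2.0f;   /* in[0] = 1 + 2j */
      inf[2] = 3.0f; inf[3] = 4.0f;   /* in[1] = 3 + 4j */
      volk_32fc_deinterleave_32f_aligned16_generic(i, q, in, 2);
      printf("I: %f %f  Q: %f %f\n", i[0], i[1], q[0], q[1]); /* I: 1 3  Q: 2 4 */
      return 0;
    }
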
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h
new file mode 100644
index 000000000..3d9ebccdd
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h
@@ -0,0 +1,78 @@
+#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_aligned16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int halfPoints = num_points / 2;
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+
+ for(;number < halfPoints; number++){
+
+ cplxValue = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_store_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_store_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 2;
+ qBufferPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_aligned16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ *qBufferPtr++ = (double)*complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h
new file mode 100644
index 000000000..3026b2422
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h
@@ -0,0 +1,80 @@
+#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the real (I) part of the complex vector, multiplies it by the scalar, converts it to 16-bit integers, and stores the result in the I buffer
+  \param complexVector The complex input vector
+  \param scalar The value to be multiplied against each of the input values
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_16s_aligned16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ __m128 cplxValue1, cplxValue2, iValue;
+
+ float floatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+
+ iValue = _mm_mul_ps(iValue, vScalar);
+
+ _mm_store_ps(floatBuffer, iValue);
+ *iBufferPtr++ = (int16_t)(floatBuffer[0]);
+ *iBufferPtr++ = (int16_t)(floatBuffer[1]);
+ *iBufferPtr++ = (int16_t)(floatBuffer[2]);
+ *iBufferPtr++ = (int16_t)(floatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ iBufferPtr = &iBuffer[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the real (I) part of the complex vector, multiplies it by the scalar, converts it to 16-bit integers, and stores the result in the I buffer
+  \param complexVector The complex input vector
+  \param scalar The value to be multiplied against each of the input values
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ unsigned int number = 0;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+ complexVectorPtr++;
+ }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */
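
A sketch of the scaled real deinterleaver: the real part of each complex sample is multiplied by the scalar and truncated to int16_t. It assumes volk/include is on the include path:

    #define LV_HAVE_GENERIC 1
    #include <inttypes.h>
    #include <stdio.h>
    #include <volk/volk_complex.h>
    #include <volk/volk_32fc_deinterleave_real_16s_aligned16.h>

    int main(void){
      lv_32fc_t in[2];
      int16_t i16[2];
      float* inf = (float*)in;
      inf[0] = 0.5f;   inf[1] = -0.25f;  /* in[0] = 0.5 - 0.25j */
      inf[2] = -0.75f; inf[3] = 1.0f;    /* in[1] = -0.75 + 1j  */
      volk_32fc_deinterleave_real_16s_aligned16_generic(i16, in, 32767.0f, 2);
      printf("%d %d\n", i16[0], i16[1]); /* 16383 -24575 */
      return 0;
    }
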
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h
new file mode 100644
index 000000000..2af973bcc
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h
@@ -0,0 +1,68 @@
+#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Deinterleaves the complex vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* iBufferPtr = iBuffer;
+
+ __m128 cplxValue1, cplxValue2, iValue;
+ for(;number < quarterPoints; number++){
+
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+
+ _mm_store_ps(iBufferPtr, iValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h
new file mode 100644
index 000000000..f408589c4
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Deinterleaves the complex vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_64f_aligned16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+
+ const unsigned int halfPoints = num_points / 2;
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+ for(;number < halfPoints; number++){
+
+ cplxValue = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_store_pd(iBufferPtr, dVal);
+
+ iBufferPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_64f_aligned16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_dot_prod_aligned16.h
new file mode 100644
index 000000000..1a834dc25
--- /dev/null
+++ b/volk/include/volk/volk_32fc_dot_prod_aligned16.h
@@ -0,0 +1,468 @@
+#ifndef INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+
+ float * res = (float*) result;
+ float * in = (float*) input;
+ float * tp = (float*) taps;
+ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+ unsigned int isodd = (num_bytes >> 3) &1;
+
+
+
+ float sum0[2] = {0,0};
+ float sum1[2] = {0,0};
+ int i = 0;
+
+
+ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+
+
+ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+
+ in += 4;
+ tp += 4;
+
+ }
+
+
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+
+
+ for(i = 0; i < isodd; ++i) {
+
+
+ *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1];
+
+ }
+
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_dot_prod_aligned16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+
+
+ asm
+ (
+ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%%r10), %%xmm2\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%r9), %%xmmA\n\t"
+ "# movaps (%%r10), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%r9), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%r10), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%r9), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " movaps 32(%%r10), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+ :
+ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+ :"rax", "r8", "r9", "r10"
+ );
+
+
+ int getem = num_bytes % 16;
+
+
+ for(; getem > 0; getem -= 8) {
+
+
+ *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
+
+ }
+
+ return;
+
+}
+
+#endif /* LV_HAVE_SSE && LV_HAVE_64 */
+
+#if LV_HAVE_SSE && LV_HAVE_32
+
+static inline void volk_32fc_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+
+ asm volatile
+ (
+ " #pushl %%ebp\n\t"
+ " #movl %%esp, %%ebp\n\t"
+ " movl 12(%%ebp), %%eax # input\n\t"
+ " movl 16(%%ebp), %%edx # taps\n\t"
+ " movl 20(%%ebp), %%ecx # n_bytes\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%eax), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%%edx), %%xmm2\n\t"
+ " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%eax), %%xmmA\n\t"
+ "# movaps (%%edx), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%eax), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%edx), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%eax), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " addl $32, %%eax\n\t"
+ " movaps 32(%%edx), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " addl $32, %%edx\n\t"
+ ".%=L1_test:\n\t"
+ " decl %%ecx\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
+ " shrl $4, %%ecx\n\t"
+ " andl $1, %%ecx\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " movl 8(%%ebp), %%eax \n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " movl $0x80000000, (%%eax)\n\t"
+ " movss (%%eax), %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " #movl 8(%%ebp), %%eax # @result\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
+ " #popl %%ebp\n\t"
+ :
+ :
+ : "eax", "ecx", "edx"
+ );
+
+
+ int getem = num_bytes % 16;
+
+ for(; getem > 0; getem -= 8) {
+
+
+ *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
+
+ }
+
+ return;
+
+
+
+
+
+
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_32*/
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>