summaryrefslogtreecommitdiff
path: root/volk/lib
diff options
context:
space:
mode:
Diffstat (limited to 'volk/lib')
-rw-r--r--volk/lib/Makefile.am361
-rw-r--r--volk/lib/assembly.h67
-rw-r--r--volk/lib/cpuid_x86.S60
-rw-r--r--volk/lib/cpuid_x86_64.S54
-rw-r--r--volk/lib/qa_16s_add_quad_aligned16.cc89
-rw-r--r--volk/lib/qa_16s_add_quad_aligned16.h18
-rw-r--r--volk/lib/qa_16s_branch_4_state_8_aligned16.cc106
-rw-r--r--volk/lib/qa_16s_branch_4_state_8_aligned16.h18
-rw-r--r--volk/lib/qa_16s_convert_32f_aligned16.cc73
-rw-r--r--volk/lib/qa_16s_convert_32f_aligned16.h18
-rw-r--r--volk/lib/qa_16s_convert_32f_unaligned16.cc73
-rw-r--r--volk/lib/qa_16s_convert_32f_unaligned16.h18
-rw-r--r--volk/lib/qa_16s_convert_8s_aligned16.cc60
-rw-r--r--volk/lib/qa_16s_convert_8s_aligned16.h18
-rw-r--r--volk/lib/qa_16s_convert_8s_unaligned16.cc60
-rw-r--r--volk/lib/qa_16s_convert_8s_unaligned16.h18
-rw-r--r--volk/lib/qa_16s_max_star_aligned16.cc65
-rw-r--r--volk/lib/qa_16s_max_star_aligned16.h18
-rw-r--r--volk/lib/qa_16s_max_star_horizontal_aligned16.cc79
-rw-r--r--volk/lib/qa_16s_max_star_horizontal_aligned16.h18
-rw-r--r--volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc78
-rw-r--r--volk/lib/qa_16s_permute_and_scalar_add_aligned16.h18
-rw-r--r--volk/lib/qa_16s_quad_max_star_aligned16.cc59
-rw-r--r--volk/lib/qa_16s_quad_max_star_aligned16.h18
-rw-r--r--volk/lib/qa_16sc_deinterleave_16s_aligned16.cc76
-rw-r--r--volk/lib/qa_16sc_deinterleave_16s_aligned16.h18
-rw-r--r--volk/lib/qa_16sc_deinterleave_32f_aligned16.cc63
-rw-r--r--volk/lib/qa_16sc_deinterleave_32f_aligned16.h18
-rw-r--r--volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc71
-rw-r--r--volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h18
-rw-r--r--volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc123
-rw-r--r--volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h18
-rw-r--r--volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc60
-rw-r--r--volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h18
-rw-r--r--volk/lib/qa_16sc_magnitude_16s_aligned16.cc70
-rw-r--r--volk/lib/qa_16sc_magnitude_16s_aligned16.h18
-rw-r--r--volk/lib/qa_16sc_magnitude_32f_aligned16.cc70
-rw-r--r--volk/lib/qa_16sc_magnitude_32f_aligned16.h18
-rw-r--r--volk/lib/qa_16u_byteswap_aligned16.cc60
-rw-r--r--volk/lib/qa_16u_byteswap_aligned16.h18
-rw-r--r--volk/lib/qa_32f_accumulator_aligned16.cc56
-rw-r--r--volk/lib/qa_32f_accumulator_aligned16.h18
-rw-r--r--volk/lib/qa_32f_add_aligned16.cc60
-rw-r--r--volk/lib/qa_32f_add_aligned16.h18
-rw-r--r--volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc59
-rw-r--r--volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h18
-rw-r--r--volk/lib/qa_32f_convert_16s_aligned16.cc70
-rw-r--r--volk/lib/qa_32f_convert_16s_aligned16.h18
-rw-r--r--volk/lib/qa_32f_convert_16s_unaligned16.cc70
-rw-r--r--volk/lib/qa_32f_convert_16s_unaligned16.h18
-rw-r--r--volk/lib/qa_32f_convert_32s_aligned16.cc70
-rw-r--r--volk/lib/qa_32f_convert_32s_aligned16.h18
-rw-r--r--volk/lib/qa_32f_convert_32s_unaligned16.cc70
-rw-r--r--volk/lib/qa_32f_convert_32s_unaligned16.h18
-rw-r--r--volk/lib/qa_32f_convert_64f_aligned16.cc60
-rw-r--r--volk/lib/qa_32f_convert_64f_aligned16.h18
-rw-r--r--volk/lib/qa_32f_convert_64f_unaligned16.cc60
-rw-r--r--volk/lib/qa_32f_convert_64f_unaligned16.h18
-rw-r--r--volk/lib/qa_32f_convert_8s_aligned16.cc70
-rw-r--r--volk/lib/qa_32f_convert_8s_aligned16.h18
-rw-r--r--volk/lib/qa_32f_convert_8s_unaligned16.cc70
-rw-r--r--volk/lib/qa_32f_convert_8s_unaligned16.h18
-rw-r--r--volk/lib/qa_32f_divide_aligned16.cc60
-rw-r--r--volk/lib/qa_32f_divide_aligned16.h18
-rw-r--r--volk/lib/qa_32f_dot_prod_aligned16.cc183
-rw-r--r--volk/lib/qa_32f_dot_prod_aligned16.h18
-rw-r--r--volk/lib/qa_32f_dot_prod_unaligned16.cc190
-rw-r--r--volk/lib/qa_32f_dot_prod_unaligned16.h18
-rw-r--r--volk/lib/qa_32f_fm_detect_aligned16.cc60
-rw-r--r--volk/lib/qa_32f_fm_detect_aligned16.h18
-rw-r--r--volk/lib/qa_32f_index_max_aligned16.cc103
-rw-r--r--volk/lib/qa_32f_index_max_aligned16.h18
-rw-r--r--volk/lib/qa_32f_interleave_16sc_aligned16.cc75
-rw-r--r--volk/lib/qa_32f_interleave_16sc_aligned16.h18
-rw-r--r--volk/lib/qa_32f_interleave_32fc_aligned16.cc62
-rw-r--r--volk/lib/qa_32f_interleave_32fc_aligned16.h18
-rw-r--r--volk/lib/qa_32f_max_aligned16.cc60
-rw-r--r--volk/lib/qa_32f_max_aligned16.h18
-rw-r--r--volk/lib/qa_32f_min_aligned16.cc60
-rw-r--r--volk/lib/qa_32f_min_aligned16.h18
-rw-r--r--volk/lib/qa_32f_multiply_aligned16.cc60
-rw-r--r--volk/lib/qa_32f_multiply_aligned16.h18
-rw-r--r--volk/lib/qa_32f_normalize_aligned16.cc65
-rw-r--r--volk/lib/qa_32f_normalize_aligned16.h18
-rw-r--r--volk/lib/qa_32f_power_aligned16.cc95
-rw-r--r--volk/lib/qa_32f_power_aligned16.h18
-rw-r--r--volk/lib/qa_32f_sqrt_aligned16.cc59
-rw-r--r--volk/lib/qa_32f_sqrt_aligned16.h18
-rw-r--r--volk/lib/qa_32f_stddev_aligned16.cc74
-rw-r--r--volk/lib/qa_32f_stddev_aligned16.h18
-rw-r--r--volk/lib/qa_32f_stddev_and_mean_aligned16.cc75
-rw-r--r--volk/lib/qa_32f_stddev_and_mean_aligned16.h18
-rw-r--r--volk/lib/qa_32f_subtract_aligned16.cc60
-rw-r--r--volk/lib/qa_32f_subtract_aligned16.h18
-rw-r--r--volk/lib/qa_32f_sum_of_poly_aligned16.cc142
-rw-r--r--volk/lib/qa_32f_sum_of_poly_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_32f_multiply_aligned16.cc85
-rw-r--r--volk/lib/qa_32fc_32f_multiply_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_32f_power_32fc_aligned16.cc83
-rw-r--r--volk/lib/qa_32fc_32f_power_32fc_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_atan2_32f_aligned16.cc75
-rw-r--r--volk/lib/qa_32fc_atan2_32f_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc137
-rw-r--r--volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_deinterleave_32f_aligned16.cc63
-rw-r--r--volk/lib/qa_32fc_deinterleave_32f_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_deinterleave_64f_aligned16.cc63
-rw-r--r--volk/lib/qa_32fc_deinterleave_64f_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc60
-rw-r--r--volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc60
-rw-r--r--volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc60
-rw-r--r--volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_dot_prod_aligned16.cc214
-rw-r--r--volk/lib/qa_32fc_dot_prod_aligned16.h20
-rw-r--r--volk/lib/qa_32fc_index_max_aligned16.cc89
-rw-r--r--volk/lib/qa_32fc_index_max_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_magnitude_16s_aligned16.cc70
-rw-r--r--volk/lib/qa_32fc_magnitude_16s_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_magnitude_32f_aligned16.cc70
-rw-r--r--volk/lib/qa_32fc_magnitude_32f_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_multiply_aligned16.cc86
-rw-r--r--volk/lib/qa_32fc_multiply_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc63
-rw-r--r--volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc63
-rw-r--r--volk/lib/qa_32fc_power_spectrum_32f_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_square_dist_aligned16.cc91
-rw-r--r--volk/lib/qa_32fc_square_dist_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc96
-rw-r--r--volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h18
-rw-r--r--volk/lib/qa_32s_and_aligned16.cc60
-rw-r--r--volk/lib/qa_32s_and_aligned16.h18
-rw-r--r--volk/lib/qa_32s_convert_32f_aligned16.cc60
-rw-r--r--volk/lib/qa_32s_convert_32f_aligned16.h18
-rw-r--r--volk/lib/qa_32s_convert_32f_unaligned16.cc60
-rw-r--r--volk/lib/qa_32s_convert_32f_unaligned16.h18
-rw-r--r--volk/lib/qa_32s_or_aligned16.cc60
-rw-r--r--volk/lib/qa_32s_or_aligned16.h18
-rw-r--r--volk/lib/qa_32u_byteswap_aligned16.cc59
-rw-r--r--volk/lib/qa_32u_byteswap_aligned16.h18
-rw-r--r--volk/lib/qa_32u_popcnt_aligned16.cc61
-rw-r--r--volk/lib/qa_32u_popcnt_aligned16.h18
-rw-r--r--volk/lib/qa_64f_convert_32f_aligned16.cc60
-rw-r--r--volk/lib/qa_64f_convert_32f_aligned16.h18
-rw-r--r--volk/lib/qa_64f_convert_32f_unaligned16.cc60
-rw-r--r--volk/lib/qa_64f_convert_32f_unaligned16.h18
-rw-r--r--volk/lib/qa_64f_max_aligned16.cc60
-rw-r--r--volk/lib/qa_64f_max_aligned16.h18
-rw-r--r--volk/lib/qa_64f_min_aligned16.cc60
-rw-r--r--volk/lib/qa_64f_min_aligned16.h18
-rw-r--r--volk/lib/qa_64u_byteswap_aligned16.cc59
-rw-r--r--volk/lib/qa_64u_byteswap_aligned16.h18
-rw-r--r--volk/lib/qa_64u_popcnt_aligned16.cc61
-rw-r--r--volk/lib/qa_64u_popcnt_aligned16.h18
-rw-r--r--volk/lib/qa_8s_convert_16s_aligned16.cc63
-rw-r--r--volk/lib/qa_8s_convert_16s_aligned16.h18
-rw-r--r--volk/lib/qa_8s_convert_16s_unaligned16.cc63
-rw-r--r--volk/lib/qa_8s_convert_16s_unaligned16.h18
-rw-r--r--volk/lib/qa_8s_convert_32f_aligned16.cc63
-rw-r--r--volk/lib/qa_8s_convert_32f_aligned16.h18
-rw-r--r--volk/lib/qa_8s_convert_32f_unaligned16.cc63
-rw-r--r--volk/lib/qa_8s_convert_32f_unaligned16.h18
-rw-r--r--volk/lib/qa_8sc_deinterleave_16s_aligned16.cc67
-rw-r--r--volk/lib/qa_8sc_deinterleave_16s_aligned16.h18
-rw-r--r--volk/lib/qa_8sc_deinterleave_32f_aligned16.cc134
-rw-r--r--volk/lib/qa_8sc_deinterleave_32f_aligned16.h18
-rw-r--r--volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc64
-rw-r--r--volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h18
-rw-r--r--volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc138
-rw-r--r--volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h18
-rw-r--r--volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc60
-rw-r--r--volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h18
-rw-r--r--volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc87
-rw-r--r--volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h18
-rw-r--r--volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc87
-rw-r--r--volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h18
-rw-r--r--volk/lib/qa_volk.cc211
-rw-r--r--volk/lib/qa_volk.h36
-rw-r--r--volk/lib/test_all.cc82
-rw-r--r--volk/lib/volk_rank_archs.c13
-rw-r--r--volk/lib/volk_rank_archs.h14
183 files changed, 9136 insertions, 0 deletions
diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am
new file mode 100644
index 000000000..97eb75680
--- /dev/null
+++ b/volk/lib/Makefile.am
@@ -0,0 +1,361 @@
+#
+# Copyright 2008 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+include $(top_srcdir)/Makefile.common
+
+AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) $(LV_CXXFLAGS)
+
+
+# We build 3 libraries and 1 executable here. libvolk and
+# libvolk_runtime contain everything except the libcppunit QA code,
+# and libvolk_qa contains only the libcppunit-based QA code. The C++
+# QA code is especially recommended when you have general purpose C or
+# C++ code that may not get thoroughly exercised by building and running
+# a GR block. The executable runs the QA code at "make check" time.
+#
+# N.B., If there's a SWIG generated shared library and associated
+# python code, it will be contained in ../python, not here. (That
+# code is conditionally built depending on the state of the
+# --without-python configure option.) However, the .i should be here
+# next to the .h that it's based on.
+
+
+# list of programs run by "make check" and "make distcheck"
+TESTS = test_all
+
+
+lib_LTLIBRARIES = \
+ libvolk.la \
+ libvolk_runtime.la \
+ libvolk_qa.la
+
+
+# ----------------------------------------------------------------
+# The main library
+# ----------------------------------------------------------------
+
+universal_runtime_CODE = \
+ volk_runtime.c \
+ volk_init.c \
+ volk_rank_archs.c
+
+universal_CODE = \
+ volk.c \
+ volk_environment_init.c
+
+generic_CODE = \
+ volk_cpu_generic.cc
+
+x86_CODE = \
+ volk_cpu_x86.c
+
+x86_SUBCODE = \
+ cpuid_x86.S
+
+x86_64_SUBCODE = \
+ cpuid_x86_64.S
+
+powerpc_CODE = \
+ volk_cpu_powerpc.cc
+
+
+if MD_CPU_generic
+libvolk_la_SOURCES = \
+ $(generic_CODE) \
+ $(universal_CODE)
+libvolk_runtime_la_SOURCES = \
+ $(generic_CODE) \
+ $(universal_runtime_CODE)
+
+endif
+
+if MD_CPU_x86
+if MD_SUBCPU_x86_64
+libvolk_la_SOURCES = \
+ $(x86_CODE) \
+ $(x86_64_SUBCODE) \
+ $(universal_CODE)
+
+libvolk_runtime_la_SOURCES = \
+ $(x86_CODE) \
+ $(x86_64_SUBCODE) \
+ $(universal_runtime_CODE)
+else
+libvolk_la_SOURCES = \
+ $(x86_CODE) \
+ $(x86_SUBCODE) \
+ $(universal_CODE)
+
+libvolk_runtime_la_SOURCES = \
+ $(x86_CODE) \
+ $(x86_SUBCODE) \
+ $(universal_runtime_CODE)
+endif
+endif
+
+
+if MD_CPU_powerpc
+libvolk_la_SOURCES = \
+ $(powerpc_CODE) \
+ $(universal_CODE)
+
+libvolk_runtime_la_SOURCES = \
+ $(powerpc_CODE) \
+ $(universal_runtime_CODE)
+endif
+
+
+
+libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
+libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
+
+libvolk_la_LIBADD =
+
+
+
+# ----------------------------------------------------------------
+# The QA library. Note libvolk.la and libvolk_runtime.la in LIBADD
+# ----------------------------------------------------------------
+libvolk_qa_la_SOURCES = \
+ qa_volk.cc \
+ qa_16s_quad_max_star_aligned16.cc \
+ qa_32fc_dot_prod_aligned16.cc \
+ qa_32fc_square_dist_aligned16.cc \
+ qa_32fc_square_dist_scalar_mult_aligned16.cc \
+ qa_32f_sum_of_poly_aligned16.cc \
+ qa_32fc_index_max_aligned16.cc \
+ qa_32f_index_max_aligned16.cc \
+ qa_32fc_conjugate_dot_prod_aligned16.cc \
+ qa_16s_permute_and_scalar_add_aligned16.cc \
+ qa_16s_branch_4_state_8_aligned16.cc \
+ qa_16s_max_star_horizontal_aligned16.cc \
+ qa_16s_max_star_aligned16.cc \
+ qa_16s_add_quad_aligned16.cc \
+ qa_32f_add_aligned16.cc \
+ qa_32f_subtract_aligned16.cc \
+ qa_32f_max_aligned16.cc \
+ qa_32f_min_aligned16.cc \
+ qa_64f_max_aligned16.cc \
+ qa_64f_min_aligned16.cc \
+ qa_32s_and_aligned16.cc \
+ qa_32s_or_aligned16.cc \
+ qa_32f_dot_prod_aligned16.cc \
+ qa_32f_dot_prod_unaligned16.cc \
+ qa_32f_fm_detect_aligned16.cc \
+ qa_32fc_32f_multiply_aligned16.cc \
+ qa_32fc_multiply_aligned16.cc \
+ qa_32f_divide_aligned16.cc \
+ qa_32f_multiply_aligned16.cc \
+ qa_32f_sqrt_aligned16.cc \
+ qa_8sc_multiply_conjugate_16sc_aligned16.cc \
+ qa_8sc_multiply_conjugate_32fc_aligned16.cc \
+ qa_32u_popcnt_aligned16.cc \
+ qa_64u_popcnt_aligned16.cc \
+ qa_64u_byteswap_aligned16.cc \
+ qa_8sc_deinterleave_32f_aligned16.cc \
+ qa_16sc_deinterleave_32f_aligned16.cc \
+ qa_8sc_deinterleave_16s_aligned16.cc \
+ qa_32f_interleave_32fc_aligned16.cc \
+ qa_16u_byteswap_aligned16.cc \
+ qa_16sc_deinterleave_16s_aligned16.cc \
+ qa_32fc_deinterleave_real_32f_aligned16.cc \
+ qa_32fc_magnitude_32f_aligned16.cc \
+ qa_32fc_deinterleave_real_64f_aligned16.cc \
+ qa_32fc_deinterleave_real_16s_aligned16.cc \
+ qa_32fc_magnitude_16s_aligned16.cc \
+ qa_32fc_deinterleave_32f_aligned16.cc \
+ qa_8sc_deinterleave_real_8s_aligned16.cc \
+ qa_32fc_deinterleave_64f_aligned16.cc \
+ qa_32f_interleave_16sc_aligned16.cc \
+ qa_16sc_deinterleave_real_8s_aligned16.cc \
+ qa_16sc_deinterleave_real_32f_aligned16.cc \
+ qa_16sc_magnitude_32f_aligned16.cc \
+ qa_32u_byteswap_aligned16.cc \
+ qa_16sc_deinterleave_real_16s_aligned16.cc \
+ qa_8sc_deinterleave_real_32f_aligned16.cc \
+ qa_16sc_magnitude_16s_aligned16.cc \
+ qa_32f_normalize_aligned16.cc \
+ qa_8sc_deinterleave_real_16s_aligned16.cc \
+ qa_16s_convert_32f_aligned16.cc \
+ qa_16s_convert_32f_unaligned16.cc \
+ qa_16s_convert_8s_aligned16.cc \
+ qa_16s_convert_8s_unaligned16.cc \
+ qa_32f_convert_16s_aligned16.cc \
+ qa_32f_convert_16s_unaligned16.cc \
+ qa_32f_convert_32s_aligned16.cc \
+ qa_32f_convert_32s_unaligned16.cc \
+ qa_32f_convert_64f_aligned16.cc \
+ qa_32f_convert_64f_unaligned16.cc \
+ qa_32f_convert_8s_aligned16.cc \
+ qa_32f_convert_8s_unaligned16.cc \
+ qa_32s_convert_32f_aligned16.cc \
+ qa_32s_convert_32f_unaligned16.cc \
+ qa_64f_convert_32f_aligned16.cc \
+ qa_64f_convert_32f_unaligned16.cc \
+ qa_8s_convert_16s_aligned16.cc \
+ qa_8s_convert_16s_unaligned16.cc \
+ qa_8s_convert_32f_aligned16.cc \
+ qa_8s_convert_32f_unaligned16.cc \
+ qa_32fc_32f_power_32fc_aligned16.cc \
+ qa_32f_power_aligned16.cc \
+ qa_32fc_atan2_32f_aligned16.cc \
+ qa_32fc_power_spectral_density_32f_aligned16.cc \
+ qa_32fc_power_spectrum_32f_aligned16.cc \
+ qa_32f_calc_spectral_noise_floor_aligned16.cc \
+ qa_32f_accumulator_aligned16.cc \
+ qa_32f_stddev_aligned16.cc \
+ qa_32f_stddev_and_mean_aligned16.cc
+
+libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
+
+libvolk_qa_la_LIBADD = \
+ libvolk.la \
+ libvolk_runtime.la \
+ $(CPPUNIT_LIBS)
+
+# ----------------------------------------------------------------
+# headers that don't get installed
+# ----------------------------------------------------------------
+noinst_HEADERS = \
+ volk_init.h \
+ qa_volk.h \
+ qa_16s_quad_max_star_aligned16.h \
+ qa_32fc_dot_prod_aligned16.h \
+ qa_32fc_square_dist_aligned16.h \
+ qa_32fc_square_dist_scalar_mult_aligned16.h \
+ qa_32f_sum_of_poly_aligned16.h \
+ qa_32fc_index_max_aligned16.h \
+ qa_32f_index_max_aligned16.h \
+ qa_32fc_conjugate_dot_prod_aligned16.h \
+ qa_16s_permute_and_scalar_add_aligned16.h \
+ qa_16s_branch_4_state_8_aligned16.h \
+ qa_16s_max_star_horizontal_aligned16.h \
+ qa_16s_max_star_aligned16.h \
+ qa_16s_add_quad_aligned16.h \
+ qa_32f_add_aligned16.h \
+ qa_32f_subtract_aligned16.h \
+ qa_32f_max_aligned16.h \
+ qa_32f_min_aligned16.h \
+ qa_64f_max_aligned16.h \
+ qa_64f_min_aligned16.h \
+ qa_32s_and_aligned16.h \
+ qa_32s_or_aligned16.h \
+ qa_32f_dot_prod_aligned16.h \
+ qa_32f_dot_prod_unaligned16.h \
+ qa_32f_fm_detect_aligned16.h \
+ qa_32fc_32f_multiply_aligned16.h \
+ qa_32fc_multiply_aligned16.h \
+ qa_32f_divide_aligned16.h \
+ qa_32f_multiply_aligned16.h \
+ qa_32f_sqrt_aligned16.h \
+ qa_8sc_multiply_conjugate_16sc_aligned16.h \
+ qa_8sc_multiply_conjugate_32fc_aligned16.h \
+ qa_32u_popcnt_aligned16.h \
+ qa_64u_popcnt_aligned16.h \
+ qa_64u_byteswap_aligned16.h \
+ qa_8sc_deinterleave_32f_aligned16.h \
+ qa_16sc_deinterleave_32f_aligned16.h \
+ qa_8sc_deinterleave_16s_aligned16.h \
+ qa_32f_interleave_32fc_aligned16.h \
+ qa_16u_byteswap_aligned16.h \
+ qa_16sc_deinterleave_16s_aligned16.h \
+ qa_32fc_deinterleave_real_32f_aligned16.h \
+ qa_32fc_magnitude_32f_aligned16.h \
+ qa_32fc_deinterleave_real_64f_aligned16.h \
+ qa_32fc_deinterleave_real_16s_aligned16.h \
+ qa_32fc_magnitude_16s_aligned16.h \
+ qa_32fc_deinterleave_32f_aligned16.h \
+ qa_8sc_deinterleave_real_8s_aligned16.h \
+ qa_32fc_deinterleave_64f_aligned16.h \
+ qa_32f_interleave_16sc_aligned16.h \
+ qa_16sc_deinterleave_real_8s_aligned16.h \
+ qa_16sc_deinterleave_real_32f_aligned16.h \
+ qa_16sc_magnitude_32f_aligned16.h \
+ qa_32u_byteswap_aligned16.h \
+ qa_16sc_deinterleave_real_16s_aligned16.h \
+ qa_8sc_deinterleave_real_32f_aligned16.h \
+ qa_16sc_magnitude_16s_aligned16.h \
+ qa_32f_normalize_aligned16.h \
+ qa_8sc_deinterleave_real_16s_aligned16.h \
+ qa_16s_convert_32f_aligned16.h \
+ qa_16s_convert_32f_unaligned16.h \
+ qa_16s_convert_8s_aligned16.h \
+ qa_16s_convert_8s_unaligned16.h \
+ qa_32f_convert_16s_aligned16.h \
+ qa_32f_convert_16s_unaligned16.h \
+ qa_32f_convert_32s_aligned16.h \
+ qa_32f_convert_32s_unaligned16.h \
+ qa_32f_convert_64f_aligned16.h \
+ qa_32f_convert_64f_unaligned16.h \
+ qa_32f_convert_8s_aligned16.h \
+ qa_32f_convert_8s_unaligned16.h \
+ qa_32s_convert_32f_aligned16.h \
+ qa_32s_convert_32f_unaligned16.h \
+ qa_64f_convert_32f_aligned16.h \
+ qa_64f_convert_32f_unaligned16.h \
+ qa_8s_convert_16s_aligned16.h \
+ qa_8s_convert_16s_unaligned16.h \
+ qa_8s_convert_32f_aligned16.h \
+ qa_8s_convert_32f_unaligned16.h \
+ qa_32fc_32f_power_32fc_aligned16.h \
+ qa_32f_power_aligned16.h \
+ qa_32fc_atan2_32f_aligned16.h \
+ qa_32fc_power_spectral_density_32f_aligned16.h \
+ qa_32fc_power_spectrum_32f_aligned16.h \
+ qa_32f_calc_spectral_noise_floor_aligned16.h \
+ qa_32f_accumulator_aligned16.h \
+ qa_32f_stddev_aligned16.h \
+ qa_32f_stddev_and_mean_aligned16.h
+
+
+# ----------------------------------------------------------------
+# Our test program
+# ----------------------------------------------------------------
+noinst_PROGRAMS = \
+ test_all
+
+test_all_SOURCES = test_all.cc
+test_all_LDADD = libvolk_qa.la
+# distclean-local: delete the files listed below (presumably generated at build time - confirm).
+# NOTE(review): volk_cpu_generic.c and volk_cpu_powerpc.c are removed here, but generic_CODE and powerpc_CODE list .cc names - confirm which extension is correct.
+distclean-local:
+ rm -f volk.c
+ rm -f volk_cpu_generic.c
+ rm -f volk_cpu_powerpc.c
+ rm -f volk_cpu_x86.c
+ rm -f volk_init.c
+ rm -f volk_init.h
+ rm -f volk_mktables
+ rm -f volk_mktables.c
+ rm -f volk_proccpu_sim.c
+ rm -f volk_runtime.c
+ rm -f volk_tables.h
+ rm -f volk_environment_init.c
+#SUBDIRS =
+
+#ifdef BUILD_SSE
+#SUBDIRS += sse
+#elif BUILD_SPU
+#SUBDIRS += spu
+#else
+#SUBDIRS += port
+#endif
+
+
diff --git a/volk/lib/assembly.h b/volk/lib/assembly.h
new file mode 100644
index 000000000..8a99aa07c
--- /dev/null
+++ b/volk/lib/assembly.h
@@ -0,0 +1,67 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2002 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _ASSEMBLY_H_
+#define _ASSEMBLY_H_
+
+/*
+ * Helper macros for the hand-written .S files:
+ *   GLOB_SYMB(f)      - assembler-level name of the C symbol f
+ *   DEF_FUNC_HEAD(f)  - directives emitted before the label of f
+ *   FUNC_TAIL(f)      - directives emitted after the body of f
+ */
+#if defined (__APPLE__) && defined (__APPLE_CC__)
+
+// XCode ignores the .scl and .type functions in XCode 2.2.1 and 2.3,
+// but creates an error in XCode 2.4. Just ignore them.
+#define GLOB_SYMB(f) _ ## f
+#define DEF_FUNC_HEAD(f) /* none */
+#define FUNC_TAIL(f) /* none */
+
+#elif !defined (__ELF__)
+
+/*
+ * Too bad, the following define does not work as expected --SF
+ * #define GLOB_SYMB(f) __USER_LABEL_PREFIX__ ## f
+ */
+#define GLOB_SYMB(f) _ ## f
+
+#define DEF_FUNC_HEAD(f) \
+ .def GLOB_SYMB(f); .scl 2; .type 32; .endef
+
+#define FUNC_TAIL(f) /* none */
+
+#else /* __ELF__ */
+
+#define GLOB_SYMB(f) f
+
+#define DEF_FUNC_HEAD(f) \
+ .type GLOB_SYMB(f),@function
+
+/* End label embeds f, so FUNC_TAIL may be used more than once per file. */
+#define FUNC_TAIL(f) \
+ .Lfe_ ## f: \
+ .size GLOB_SYMB(f),.Lfe_ ## f-GLOB_SYMB(f)
+
+#endif /* __ELF__ */
+
+#endif /* _ASSEMBLY_H_ */
diff --git a/volk/lib/cpuid_x86.S b/volk/lib/cpuid_x86.S
new file mode 100644
index 000000000..4e1a9404f
--- /dev/null
+++ b/volk/lib/cpuid_x86.S
@@ -0,0 +1,60 @@
+#
+# Copyright 2003 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+#
+# execute CPUID instruction, return EAX, EBX, ECX and EDX values in result
+#
+# void cpuid_x86 (unsigned int op, unsigned int result[4]);
+#
+
+#include "assembly.h"
+
+.file "cpuid_x86.S"
+ .version "01.01"
+.text
+.globl GLOB_SYMB(cpuid_x86)
+ DEF_FUNC_HEAD(cpuid_x86)
+GLOB_SYMB(cpuid_x86):
+ pushl %ebp # save the caller frame pointer
+ movl %esp, %ebp # arguments are addressed relative to %ebp below
+ pushl %ebx # must save in PIC mode, holds GOT pointer
+ pushl %esi # saved because it is used below for the result pointer
+
+ movl 8(%ebp), %eax # op
+ movl 12(%ebp), %esi # result
+ cpuid # NOTE(review): ECX is not set before cpuid - confirm callers only pass ops that ignore it
+ movl %eax, 0(%esi) # result[0] = EAX
+ movl %ebx, 4(%esi) # result[1] = EBX
+ movl %ecx, 8(%esi) # result[2] = ECX
+ movl %edx, 12(%esi) # result[3] = EDX
+
+ popl %esi # restore saved registers in reverse push order
+ popl %ebx
+ popl %ebp
+ ret
+
+FUNC_TAIL(cpuid_x86)
+ .ident "Hand coded cpuid assembly"
+
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/volk/lib/cpuid_x86_64.S b/volk/lib/cpuid_x86_64.S
new file mode 100644
index 000000000..32b1847cd
--- /dev/null
+++ b/volk/lib/cpuid_x86_64.S
@@ -0,0 +1,54 @@
+#
+# Copyright 2003,2005 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+#
+# execute CPUID instruction, return EAX, EBX, ECX and EDX values in result
+#
+# void cpuid_x86 (unsigned int op, unsigned int result[4]);
+#
+
+#include "assembly.h"
+
+.file "cpuid_x86_64.S"
+ .version "01.01"
+.text
+.globl GLOB_SYMB(cpuid_x86)
+ DEF_FUNC_HEAD(cpuid_x86)
+GLOB_SYMB(cpuid_x86):
+ mov %rbx, %r11 # must save in PIC mode, holds GOT pointer (kept in %r11, not on the stack)
+
+ mov %rdi, %rax # op
+ cpuid # NOTE(review): ECX is not set before cpuid - confirm callers only pass ops that ignore it
+ movl %eax, 0(%rsi) # result[0] = EAX (%rsi holds the result pointer)
+ movl %ebx, 4(%rsi) # result[1] = EBX
+ movl %ecx, 8(%rsi) # result[2] = ECX
+ movl %edx, 12(%rsi) # result[3] = EDX
+
+ mov %r11, %rbx # restore the %rbx value saved on entry
+ retq
+
+FUNC_TAIL(cpuid_x86)
+ .ident "Hand coded cpuid64 assembly"
+
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc
new file mode 100644
index 000000000..c3005c1be
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.cc
@@ -0,0 +1,89 @@
+#include <volk/volk.h>
+#include <qa_16s_add_quad_aligned16.h>
+#include <volk/volk_16s_add_quad_aligned16.h>
+#include <cstdlib>
+#include <time.h>
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+// Built without SSE2: there is no sse2 kernel to compare against, so only report the skip.
+void qa_16s_add_quad_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+// Runs the "generic" and "sse2" variants of volk_16s_add_quad_aligned16 on the
+// same random inputs, prints the clock() time spent in each, and asserts that
+// the four output vectors of both variants agree element by element.
+void qa_16s_add_quad_aligned16::t1() {
+  volk_environment_init();
+
+  const int vlen = 3200;
+  const int ITERS = 100000;
+  // Length argument handed to the kernel; presumably a byte count (2 bytes per short) - confirm.
+  const int num_bytes = vlen << 1;
+
+  short input0[vlen] __attribute__ ((aligned (16)));
+  short input1[vlen] __attribute__ ((aligned (16)));
+  short input2[vlen] __attribute__ ((aligned (16)));
+  short input3[vlen] __attribute__ ((aligned (16)));
+  short input4[vlen] __attribute__ ((aligned (16)));
+
+  // Results of the "generic" run.
+  short output0[vlen] __attribute__ ((aligned (16)));
+  short output1[vlen] __attribute__ ((aligned (16)));
+  short output2[vlen] __attribute__ ((aligned (16)));
+  short output3[vlen] __attribute__ ((aligned (16)));
+  // Results of the "sse2" run.
+  short output01[vlen] __attribute__ ((aligned (16)));
+  short output11[vlen] __attribute__ ((aligned (16)));
+  short output21[vlen] __attribute__ ((aligned (16)));
+  short output31[vlen] __attribute__ ((aligned (16)));
+
+  // Every input sample is the difference of two random shorts that were each shifted right by 2.
+  for(int i = 0; i < vlen; ++i) {
+    short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short plus1 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short minus1 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short plus2 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short minus2 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short plus3 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    input0[i] = plus0 - minus0;
+    input1[i] = plus1 - minus1;
+    input2[i] = plus2 - minus2;
+    input3[i] = plus3 - minus3;
+    input4[i] = plus4 - minus4;
+  }
+  printf("16s_add_quad_aligned\n");
+
+  clock_t start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_add_quad_aligned16_manual(output0, output1, output2, output3, input0, input1, input2, input3, input4, num_bytes, "generic");
+  }
+  clock_t end = clock();
+  double total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_add_quad_aligned16_manual(output01, output11, output21, output31, input0, input1, input2, input3, input4, num_bytes, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  // The sse2 results must match the generic results exactly.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output1[i], output11[i]);
+    CPPUNIT_ASSERT_EQUAL(output2[i], output21[i]);
+    CPPUNIT_ASSERT_EQUAL(output3[i], output31[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_add_quad_aligned16.h b/volk/lib/qa_16s_add_quad_aligned16.h
new file mode 100644
index 000000000..3c1ae978b
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
+#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_add_quad_aligned16 : public CppUnit::TestCase {
+ // CppUnit test case for the 16s_add_quad_aligned16 kernel; the suite below registers t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_16s_add_quad_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
new file mode 100644
index 000000000..ba5e8ed93
--- /dev/null
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
@@ -0,0 +1,106 @@
+#include <volk/volk.h>
+#include <qa_16s_branch_4_state_8_aligned16.h>
+#include <cstdlib>
+#include <time.h>
+
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16s_branch_4_state_8_aligned16::t1() { // Variant compiled when LV_HAVE_SSSE3 is not defined: prints a notice and asserts nothing.
+ printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_branch_4_state_8_aligned16::t1() { // Times three kernel variants over the same src0 and asserts that all three outputs agree.
+ const int num_iters = 1000000;
+ const int vlen = 32;
+ // Four 16-entry char tables, 16-byte aligned, handed to the branch_4_state_8 calls through permuters[].
+ static char permute0[16]__attribute__((aligned(16))) = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03};
+ static char permute1[16]__attribute__((aligned(16))) = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01};
+ static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f};
+ static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d};
+ static char* permuters[4] = {permute0, permute1, permute2, permute3};
+ // vlen * 2; only the permute_and_scalar_add call below takes this length argument.
+ unsigned int num_bytes = vlen << 1;
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ // One result buffer per variant: target (permute_and_scalar_add), target2 (ssse3), target3 (generic).
+ short target[vlen] __attribute__ ((aligned (16)));
+ short target2[vlen] __attribute__ ((aligned (16)));
+ short target3[vlen] __attribute__ ((aligned (16)));
+ // Fixed index and control tables; cntl0/cntl1 are used only by the permute_and_scalar_add call.
+ short src0[vlen] __attribute__ ((aligned (16)));
+ short permute_indexes[vlen] __attribute__ ((aligned (16))) = {
+7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 };
+ short cntl0[vlen] __attribute__ ((aligned (16))) = {
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
+ short cntl1[vlen] __attribute__ ((aligned (16))) = {
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
+ short cntl2[vlen] __attribute__ ((aligned (16))) = {
+ 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 };
+ short cntl3[vlen] __attribute__ ((aligned (16))) = {
+ 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff };
+ short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4};
+
+ // Source data is simply 0 .. vlen-1.
+
+ for(int i = 0; i < vlen; ++i) {
+ src0[i] = i;
+
+ }
+
+
+ printf("16s_branch_4_state_8_aligned\n");
+
+ // Timing 1: permute_and_scalar_add, "sse2" implementation, writing target.
+ start = clock();
+ for(int i = 0; i < num_iters; ++i) {
+ volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
+ }
+ end = clock();
+
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+ printf("permute_and_scalar_add_time: %f\n", total);
+
+ // Timing 2: branch_4_state_8, "ssse3" implementation, writing target2.
+
+ start = clock();
+ for(int i = 0; i < num_iters; ++i) {
+ volk_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3");
+ }
+ end = clock();
+
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+ printf("branch_4_state_8_time, ssse3: %f\n", total);
+ // Timing 3: branch_4_state_8, "generic" implementation, writing target3.
+ start = clock();
+ for(int i = 0; i < num_iters; ++i) {
+ volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic");
+ }
+ end = clock();
+
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ // The label names the kernel that was actually timed above (branch_4_state_8, generic).
+ printf("branch_4_state_8_time, generic: %f\n", total);
+
+ // Dump the permute_and_scalar_add result next to the generic branch_4_state_8 result.
+
+ for(int i = 0; i < vlen; ++i) {
+ printf("psa... %d, b4s8... %d\n", target[i], target3[i]);
+ }
+ // Both branch_4_state_8 outputs must equal the permute_and_scalar_add output element for element.
+ for(int i = 0; i < vlen; ++i) {
+
+ CPPUNIT_ASSERT(target[i] == target2[i]);
+ CPPUNIT_ASSERT(target[i] == target3[i]);
+ }
+}
+
+
+#endif
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.h b/volk/lib/qa_16s_branch_4_state_8_aligned16.h
new file mode 100644
index 000000000..41ab073e0
--- /dev/null
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
+#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase { // CppUnit test case class for the 16s_branch_4_state_8_aligned16 QA
+ // Suite registration: one test method, t1.
+ CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is only declared here; its body is not part of this header.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_convert_32f_aligned16.cc b/volk/lib/qa_16s_convert_32f_aligned16.cc
new file mode 100644
index 000000000..7878d4737
--- /dev/null
+++ b/volk/lib/qa_16s_convert_32f_aligned16.cc
@@ -0,0 +1,73 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_16s_convert_32f_aligned16.h>
+#include <volk/volk_16s_convert_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_16s_convert_32f_aligned16::t1() { // Variant compiled when LV_HAVE_SSE is not defined: prints a notice and asserts nothing.
+ printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_convert_32f_aligned16::t1() { // Times the "generic", "sse" and runtime-dispatched convert calls, then requires identical float outputs.
+ // Runtime and environment are initialised before any kernel call.
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ int16_t input0[vlen] __attribute__ ((aligned (16)));
+ // One output buffer per call path under test.
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_sse[vlen] __attribute__ ((aligned (16)));
+ float output_sse4_1[vlen] __attribute__ ((aligned (16)));
+ // Fill the input with rand()-derived values scaled by 32768.0 and cast to int16_t.
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0));
+ }
+ printf("16s_convert_32f_aligned\n");
+ // Each timing section repeats the same call ITERS times and prints elapsed clock() seconds.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_convert_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_convert_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+ // NOTE(review): this section goes through get_volk_runtime() without naming an arch; the "sse4_1" label is presumably what the runtime picks — confirm.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_16s_convert_32f_aligned16(output_sse4_1, input0, 32768.0, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4_1_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // Exact element-wise comparison of both other outputs against the generic output.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_convert_32f_aligned16.h b/volk/lib/qa_16s_convert_32f_aligned16.h
new file mode 100644
index 000000000..ef813d96f
--- /dev/null
+++ b/volk/lib/qa_16s_convert_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H
+#define INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_convert_32f_aligned16 : public CppUnit::TestCase { // CppUnit test case class for the 16s_convert_32f_aligned16 QA
+ // Suite registration: one test method, t1.
+ CPPUNIT_TEST_SUITE (qa_16s_convert_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is only declared here; its body is not part of this header.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.cc b/volk/lib/qa_16s_convert_32f_unaligned16.cc
new file mode 100644
index 000000000..8c3121e5c
--- /dev/null
+++ b/volk/lib/qa_16s_convert_32f_unaligned16.cc
@@ -0,0 +1,73 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_16s_convert_32f_unaligned16.h>
+#include <volk/volk_16s_convert_32f_unaligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_16s_convert_32f_unaligned16::t1() { // Variant compiled when LV_HAVE_SSE is not defined: prints a notice and asserts nothing.
+ printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_convert_32f_unaligned16::t1() { // Times the "generic", "sse" and runtime-dispatched convert calls, then requires identical float outputs.
+ // Runtime and environment are initialised before any kernel call.
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ int16_t input0[vlen] __attribute__ ((aligned (16)));
+ // One output buffer per call path under test; the buffers here are declared 16-byte aligned even though the kernel name says unaligned.
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_sse[vlen] __attribute__ ((aligned (16)));
+ float output_sse4_1[vlen] __attribute__ ((aligned (16)));
+ // Fill the input with rand()-derived values scaled by 32768.0 and cast to int16_t.
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0));
+ }
+ printf("16s_convert_32f_unaligned\n");
+ // Each timing section repeats the same call ITERS times and prints elapsed clock() seconds.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_convert_32f_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_convert_32f_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+ // NOTE(review): this section goes through get_volk_runtime() without naming an arch; the "sse4_1" label is presumably what the runtime picks — confirm.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_16s_convert_32f_unaligned16(output_sse4_1, input0, 32768.0, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4_1_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // Exact element-wise comparison of both other outputs against the generic output.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.h b/volk/lib/qa_16s_convert_32f_unaligned16.h
new file mode 100644
index 000000000..aeb04f770
--- /dev/null
+++ b/volk/lib/qa_16s_convert_32f_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H
+#define INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_convert_32f_unaligned16 : public CppUnit::TestCase { // CppUnit test case class for the 16s_convert_32f_unaligned16 QA
+ // Suite registration: one test method, t1.
+ CPPUNIT_TEST_SUITE (qa_16s_convert_32f_unaligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is only declared here; its body is not part of this header.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H */
diff --git a/volk/lib/qa_16s_convert_8s_aligned16.cc b/volk/lib/qa_16s_convert_8s_aligned16.cc
new file mode 100644
index 000000000..734b7784e
--- /dev/null
+++ b/volk/lib/qa_16s_convert_8s_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16s_convert_8s_aligned16.h>
+#include <volk/volk_16s_convert_8s_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_convert_8s_aligned16::t1() { // Variant compiled when LV_HAVE_SSE2 is not defined: prints a notice and asserts nothing.
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_convert_8s_aligned16::t1() { // Times the "generic" and "sse2" convert calls on the same input, then requires identical int8_t outputs.
+ // Environment is initialised before any kernel call.
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ int16_t input0[vlen] __attribute__ ((aligned (16)));
+ // One output buffer per implementation under test.
+ int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int8_t output_sse2[vlen] __attribute__ ((aligned (16)));
+ // Fill the input with rand()-derived values scaled by 32768.0 and cast to int16_t.
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0));
+ }
+ printf("16s_convert_8s_aligned\n");
+ // Each timing section repeats the same call ITERS times and prints elapsed clock() seconds.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_convert_8s_aligned16_manual(output_generic, input0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_convert_8s_aligned16_manual(output_sse2, input0, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // Exact element-wise comparison of the sse2 output against the generic output.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d -> %d...%d\n", input0[i], output_generic[i], output_sse2[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_convert_8s_aligned16.h b/volk/lib/qa_16s_convert_8s_aligned16.h
new file mode 100644
index 000000000..2e409d0cc
--- /dev/null
+++ b/volk/lib/qa_16s_convert_8s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H
+#define INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_convert_8s_aligned16 : public CppUnit::TestCase { // CppUnit test case class for the 16s_convert_8s_aligned16 QA
+ // Suite registration: one test method, t1.
+ CPPUNIT_TEST_SUITE (qa_16s_convert_8s_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is only declared here; its body is not part of this header.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.cc b/volk/lib/qa_16s_convert_8s_unaligned16.cc
new file mode 100644
index 000000000..275ab7668
--- /dev/null
+++ b/volk/lib/qa_16s_convert_8s_unaligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16s_convert_8s_unaligned16.h>
+#include <volk/volk_16s_convert_8s_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_convert_8s_unaligned16::t1() { // Variant compiled when LV_HAVE_SSE2 is not defined: prints a notice and asserts nothing.
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_convert_8s_unaligned16::t1() { // Times the "generic" and "sse2" convert calls on the same input, then requires identical int8_t outputs.
+ // Environment is initialised before any kernel call.
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ int16_t input0[vlen] __attribute__ ((aligned (16)));
+ // One output buffer per implementation; the buffers here are declared 16-byte aligned even though the kernel name says unaligned.
+ int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int8_t output_sse2[vlen] __attribute__ ((aligned (16)));
+ // Fill the input with rand()-derived values scaled by 32768.0 and cast to int16_t.
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0));
+ }
+ printf("16s_convert_8s_unaligned\n");
+ // Each timing section repeats the same call ITERS times and prints elapsed clock() seconds.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_convert_8s_unaligned16_manual(output_generic, input0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_convert_8s_unaligned16_manual(output_sse2, input0, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // Exact element-wise comparison of the sse2 output against the generic output.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.h b/volk/lib/qa_16s_convert_8s_unaligned16.h
new file mode 100644
index 000000000..4b2fe9e42
--- /dev/null
+++ b/volk/lib/qa_16s_convert_8s_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H
+#define INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_convert_8s_unaligned16 : public CppUnit::TestCase { // CppUnit test case class for the 16s_convert_8s_unaligned16 QA
+ // Suite registration: one test method, t1.
+ CPPUNIT_TEST_SUITE (qa_16s_convert_8s_unaligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is only declared here; its body is not part of this header.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H */
diff --git a/volk/lib/qa_16s_max_star_aligned16.cc b/volk/lib/qa_16s_max_star_aligned16.cc
new file mode 100644
index 000000000..b46b9ae8e
--- /dev/null
+++ b/volk/lib/qa_16s_max_star_aligned16.cc
@@ -0,0 +1,65 @@
+#include <volk/volk.h>
+#include <qa_16s_max_star_aligned16.h>
+#include <volk/volk_16s_max_star_aligned16.h>
+#include <cstdlib>
+#include <time.h>
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16s_max_star_aligned16::t1() { // Variant compiled when LV_HAVE_SSSE3 is not defined: prints a notice and asserts nothing.
+ printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+
+
+void qa_16s_max_star_aligned16::t1() { // Times the "generic" and "ssse3" max_star calls on the same input and compares element 0 of their outputs.
+ // Environment is initialised before any kernel call.
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 6400;
+ const int ITERS = 100000;
+ short input0[vlen] __attribute__ ((aligned (16)));
+ short output0[1] __attribute__ ((aligned (16)));
+
+ short output1[1] __attribute__ ((aligned (16)));
+ // Each input is the difference of two rand()-derived shorts, each shifted right by 2.
+ for(int i = 0; i < vlen; ++i) {
+ short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+
+ short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+
+ input0[i] = plus0 - minus0;
+
+ }
+ printf("16s_max_star_aligned\n");
+ // Both timing sections pass vlen << 1 as the length argument and repeat the call ITERS times.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_max_star_aligned16_manual(output0, input0, vlen << 1, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_max_star_aligned16_manual(output1, input0, vlen << 1, "ssse3");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("ssse3_time: %f\n", total);
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // The output buffers hold one short each, so only index 0 is compared.
+ for(int i = 0; i < 1; ++i) {
+
+ CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_max_star_aligned16.h b/volk/lib/qa_16s_max_star_aligned16.h
new file mode 100644
index 000000000..119f87c4d
--- /dev/null
+++ b/volk/lib/qa_16s_max_star_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H
+#define INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_max_star_aligned16 : public CppUnit::TestCase { // CppUnit test case class for the 16s_max_star_aligned16 QA
+ // Suite registration: one test method, t1.
+ CPPUNIT_TEST_SUITE (qa_16s_max_star_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is only declared here; its body is not part of this header.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.cc b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc
new file mode 100644
index 000000000..4d44735df
--- /dev/null
+++ b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc
@@ -0,0 +1,79 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_16s_max_star_horizontal_aligned16.h>
+#include <volk/volk_16s_max_star_horizontal_aligned16.h>
+#include <cstdlib>
+#include <time.h>
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16s_max_star_horizontal_aligned16::t1() { // Variant compiled when LV_HAVE_SSSE3 is not defined: prints a notice and asserts nothing.
+ printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+
+void qa_16s_max_star_horizontal_aligned16::t1() { // Chains three calls through the "generic" kernel and three through the runtime-dispatched kernel, then compares the first vlen/2 outputs.
+ // Runtime and environment are initialised before any kernel call.
+
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 32;
+ const int ITERS = 1;
+ short input0[vlen] __attribute__ ((aligned (16)));
+ short output0[vlen>>1] __attribute__ ((aligned (16)));
+ // Separate result buffer for the runtime-dispatched calls.
+ short output1[vlen>>1] __attribute__ ((aligned (16)));
+ // Each input is the difference of two rand()-derived shorts.
+ for(int i = 0; i < vlen; ++i) {
+ short plus0 = ((short) (rand() - (RAND_MAX/2)));
+
+ short minus0 = ((short) (rand() - (RAND_MAX/2)));
+
+ input0[i] = plus0 - minus0;
+
+ }
+ printf("16s_max_star_horizontal_aligned\n");
+ // Generic path: the second and third calls run in place on output0 with lengths vlen and vlen/2.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_max_star_horizontal_aligned16_manual(output0, input0, 2*vlen, "generic");
+ volk_16s_max_star_horizontal_aligned16_manual(output0, output0, vlen, "generic");
+ volk_16s_max_star_horizontal_aligned16_manual(output0, output0, vlen/2, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ // NOTE(review): the third call below passes vlen, whereas the generic path above passes vlen/2 — confirm this difference is intended.
+ get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, input0, 2*vlen);
+ get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, output1, vlen);
+ get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, output1, vlen);
+ /* volk_16s_max_star_horizontal_aligned16(output1, input0, 2*vlen, "ssse3");
+ volk_16s_max_star_horizontal_aligned16(output1, output1, vlen, "ssse3");
+ volk_16s_max_star_horizontal_aligned16(output1, output1, vlen, "ssse3");*/
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("ssse3_time: %f\n", total);
+ // The next loop only holds commented-out debug prints.
+ for(int i = 0; i < (vlen >> 1); ++i) {
+ // printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+
+ }
+ for(int i = 0; i < (vlen >> 1); ++i) {
+ // Exact comparison of every element of the two output buffers.
+ CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]);
+ }
+ }
+
+
+#endif
+
diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.h b/volk/lib/qa_16s_max_star_horizontal_aligned16.h
new file mode 100644
index 000000000..9f9757253
--- /dev/null
+++ b/volk/lib/qa_16s_max_star_horizontal_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H
+#define INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_max_star_horizontal_aligned16 : public CppUnit::TestCase { // CppUnit test case class for the 16s_max_star_horizontal_aligned16 QA
+ // Suite registration: one test method, t1.
+ CPPUNIT_TEST_SUITE (qa_16s_max_star_horizontal_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is only declared here; its body is not part of this header.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
new file mode 100644
index 000000000..3c4f5c6cc
--- /dev/null
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
@@ -0,0 +1,78 @@
+#include <volk/volk.h>
+#include <qa_16s_permute_and_scalar_add_aligned16.h>
+#include <volk/volk_16s_permute_and_scalar_add_aligned16.h>
+#include <cstdlib>
+#include <time.h>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_permute_and_scalar_add_aligned16::t1() { // Variant compiled when LV_HAVE_SSE2 is not defined: prints a notice and asserts nothing.
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_permute_and_scalar_add_aligned16::t1() { // Times the "generic" and "sse2" permute_and_scalar_add calls on identical inputs and requires equal outputs.
+ const int vlen = 64;
+ // vlen * 2; passed as the length argument of both kernel calls.
+ unsigned int num_bytes = vlen << 1;
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ // target receives the generic result, target2 the sse2 result.
+ short target[vlen] __attribute__ ((aligned (16)));
+ short target2[vlen] __attribute__ ((aligned (16)));
+ short src0[vlen] __attribute__ ((aligned (16)));
+ short permute_indexes[vlen] __attribute__ ((aligned (16)));
+ short cntl0[vlen] __attribute__ ((aligned (16)));
+ short cntl1[vlen] __attribute__ ((aligned (16)));
+ short cntl2[vlen] __attribute__ ((aligned (16)));
+ short cntl3[vlen] __attribute__ ((aligned (16)));
+ short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4};
+ // src0 is 0..vlen-1; indexes are (3*i) mod vlen; cntl0 is 0xff everywhere; cntl1..cntl3 are 0xff or 0 depending on i.
+ for(int i = 0; i < vlen; ++i) {
+ src0[i] = i;
+ permute_indexes[i] = (3 * i)%vlen;
+ cntl0[i] = 0xff;
+ cntl1[i] = 0xff * (i%2);
+ cntl2[i] = 0xff * ((i>>1)%2);
+ cntl3[i] = 0xff * ((i%4) == 3);
+ }
+
+ printf("16s_permute_and_scalar_add_aligned\n");
+ // Timing 1: "generic" implementation, 100000 repetitions.
+ start = clock();
+ for(int i = 0; i < 100000; ++i) {
+ volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic");
+ }
+ end = clock();
+
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+ printf("generic_time: %f\n", total);
+ // Timing 2: "sse2" implementation, 100000 repetitions.
+ start = clock();
+ for(int i = 0; i < 100000; ++i) {
+ volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
+ }
+ end = clock();
+
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+ printf("sse2_time: %f\n", total);
+
+ // The next loop only holds a commented-out debug print.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("generic... %d, sse2... %d\n", target[i], target2[i]);
+ }
+ // Element-wise equality of the generic and sse2 results.
+ for(int i = 0; i < vlen; ++i) {
+
+ CPPUNIT_ASSERT(target[i] == target2[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h
new file mode 100644
index 000000000..3643aeef6
--- /dev/null
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase { // CppUnit test case class for the 16s_permute_and_scalar_add_aligned16 QA
+ // Suite registration: one test method, t1.
+ CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is only declared here; its body is not part of this header.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc
new file mode 100644
index 000000000..80a220c93
--- /dev/null
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc
@@ -0,0 +1,59 @@
+#include <volk/volk.h>
+#include <qa_16s_quad_max_star_aligned16.h>
+#include <volk/volk_16s_quad_max_star_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_quad_max_star_aligned16::t1() { // Variant compiled when LV_HAVE_SSE2 is not defined: prints a notice and asserts nothing.
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_quad_max_star_aligned16::t1() { // Runs the "generic" and "sse2" quad_max_star calls once each on the same four inputs and requires equal outputs (no timing here).
+ const int vlen = 34;
+ // Four input vectors, 16-byte aligned.
+ short input0[vlen] __attribute__ ((aligned (16)));
+ short input1[vlen] __attribute__ ((aligned (16)));
+ short input2[vlen] __attribute__ ((aligned (16)));
+ short input3[vlen] __attribute__ ((aligned (16)));
+ // output0 receives the generic result, output1 the sse2 result.
+ short output0[vlen] __attribute__ ((aligned (16)));
+ short output1[vlen] __attribute__ ((aligned (16)));
+ // Each input element is the difference of two rand()-derived shorts.
+ for(int i = 0; i < vlen; ++i) {
+ short plus0 = (short) (rand() - (RAND_MAX/2));
+ short plus1 = (short) (rand() - (RAND_MAX/2));
+ short plus2 = (short) (rand() - (RAND_MAX/2));
+ short plus3 = (short) (rand() - (RAND_MAX/2));
+
+ short minus0 = (short) (rand() - (RAND_MAX/2));
+ short minus1 = (short) (rand() - (RAND_MAX/2));
+ short minus2 = (short) (rand() - (RAND_MAX/2));
+ short minus3 = (short) (rand() - (RAND_MAX/2));
+
+ input0[i] = plus0 - minus0;
+ input1[i] = plus1 - minus1;
+ input2[i] = plus2 - minus2;
+ input3[i] = plus3 - minus3;
+ }
+ // Both calls pass 2*vlen as the length argument.
+ volk_16s_quad_max_star_aligned16_manual(output0, input0, input1, input2, input3, 2*vlen, "generic");
+
+ volk_16s_quad_max_star_aligned16_manual(output1, input0, input1, input2, input3, 2*vlen, "sse2");
+ // Print both results next to the four inputs for every element.
+ printf("16s_quad_max_star_aligned\n");
+ for(int i = 0; i < vlen; ++i) {
+ printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]);
+ }
+ // Element-wise exact comparison of the two outputs.
+ for(int i = 0; i < vlen; ++i) {
+
+ CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.h b/volk/lib/qa_16s_quad_max_star_aligned16.h
new file mode 100644
index 000000000..51e77081a
--- /dev/null
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
+#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase { // CppUnit test case class for the 16s_quad_max_star_aligned16 QA
+ // Suite registration: one test method, t1.
+ CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is only declared here; its body is not part of this header.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc
new file mode 100644
index 000000000..e700ac72c
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc
@@ -0,0 +1,76 @@
+#include <volk/volk.h>
+#include <qa_16sc_deinterleave_16s_aligned16.h>
+#include <volk/volk_16sc_deinterleave_16s_aligned16.h>
+#include <cstdlib>
+
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16sc_deinterleave_16s_aligned16::t1() { // Variant compiled when LV_HAVE_SSSE3 is not defined: prints a notice and asserts nothing.
+ printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_deinterleave_16s_aligned16::t1() { // Times the "generic", "sse2" and "ssse3" deinterleave calls on the same complex input and requires both output arrays to match the generic ones.
+ // Environment is initialised before any kernel call.
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+ // Two output arrays per implementation (first and second output argument of the kernel).
+ int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int16_t output_generic1[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse2[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse21[vlen] __attribute__ ((aligned (16)));
+ int16_t output_ssse3[vlen] __attribute__ ((aligned (16)));
+ int16_t output_ssse31[vlen] __attribute__ ((aligned (16)));
+ // Fill all 2*vlen int16_t slots of input0 through a raw pointer. NOTE(review): the scale below is 32678.0 while sibling tests use 32768.0 — confirm intent.
+ int16_t* loadInput = (int16_t*)input0;
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] = ((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32678.0));
+ }
+ printf("16sc_deinterleave_16s_aligned\n");
+ // Each timing section repeats the same call ITERS times and prints elapsed clock() seconds.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_16s_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_16s_aligned16_manual(output_ssse3, output_ssse31, input0, vlen, "ssse3");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("ssse3_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // Exact element-wise comparison: sse2 outputs first, then ssse3 outputs, each against generic.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_sse21[i]);
+
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_ssse31[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.h b/volk/lib/qa_16sc_deinterleave_16s_aligned16.h
new file mode 100644
index 000000000..995ab5b34
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H
+#define INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_deinterleave_16s_aligned16 : public CppUnit::TestCase { // CppUnit test case class for the 16sc_deinterleave_16s_aligned16 QA
+ // Suite registration: one test method, t1.
+ CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_16s_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is only declared here; its body is not part of this header.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc
new file mode 100644
index 000000000..6ee076998
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk.h>
+#include <qa_16sc_deinterleave_32f_aligned16.h>
+#include <volk/volk_16sc_deinterleave_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16sc_deinterleave_32f_aligned16::t1() { // Fallback built when LV_HAVE_SSE2 is undefined: prints a notice and asserts nothing.
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_deinterleave_32f_aligned16::t1() { // Times the "generic" and "sse" calls of volk_16sc_deinterleave_32f_aligned16_manual and checks that both output pairs agree.
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201; // number of complex input samples
+ const int ITERS = 100000; // calls per timed loop
+ std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_generic1[vlen] __attribute__ ((aligned (16)));
+ float output_sse2[vlen] __attribute__ ((aligned (16)));
+ float output_sse21[vlen] __attribute__ ((aligned (16)));
+
+ int16_t* loadInput = (int16_t*)input0; // view the complex buffer as 2*vlen raw int16_t values
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0)); // rand() mapped to roughly [-1, 1], scaled by 32768, then converted to int16_t
+ }
+ printf("16sc_deinterleave_32f_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 32768.0, vlen, "generic"); // trailing string presumably selects the implementation - confirm against volk
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC; // elapsed clock() ticks converted to seconds
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_32f_aligned16_manual(output_sse2, output_sse21, input0, 32768.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); // tolerance is 1e-4 of the generic value's magnitude
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse21[i], fabs(output_generic1[i])*1e-4);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.h b/volk/lib/qa_16sc_deinterleave_32f_aligned16.h
new file mode 100644
index 000000000..fea3b6c2d
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H
+#define INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_deinterleave_32f_aligned16 : public CppUnit::TestCase { // CppUnit test case exposing a single test, t1
+
+ CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_32f_aligned16); // begin the suite declaration for this class
+ CPPUNIT_TEST (t1); // register t1 as the only test in the suite
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // body lives in qa_16sc_deinterleave_32f_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc
new file mode 100644
index 000000000..ca048ea67
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc
@@ -0,0 +1,71 @@
+#include <volk/volk.h>
+#include <qa_16sc_deinterleave_real_16s_aligned16.h>
+#include <volk/volk_16sc_deinterleave_real_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse2 and ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16sc_deinterleave_real_16s_aligned16::t1() { // Fallback built when LV_HAVE_SSSE3 is undefined: prints a notice and asserts nothing.
+ printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_deinterleave_real_16s_aligned16::t1() { // Times the "generic", "sse2" and "ssse3" calls of volk_16sc_deinterleave_real_16s_aligned16_manual and checks that their outputs agree.
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201; // number of complex input samples
+ const int ITERS = 100000; // calls per timed loop
+ std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse2[vlen] __attribute__ ((aligned (16)));
+ int16_t output_ssse3[vlen] __attribute__ ((aligned (16)));
+
+ int16_t* loadInput = (int16_t*)input0; // view the complex buffer as 2*vlen raw int16_t values
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] = ((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32678.0)); // NOTE(review): 32678.0 looks like a transposition of the 32768.0 used by the sibling tests - confirm
+ }
+ printf("16sc_deinterleave_real_16s_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_real_16s_aligned16_manual(output_generic, input0, vlen, "generic"); // trailing string presumably selects the implementation - confirm against volk
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC; // elapsed clock() ticks converted to seconds
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_real_16s_aligned16_manual(output_sse2, input0, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_real_16s_aligned16_manual(output_ssse3, input0, vlen, "ssse3");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("ssse3_time: %f\n", total);
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ // printf("%d = generic... %d, sse2... %d, ssse3... %d\n", i, output_generic[i], output_sse2[i], output_ssse3[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); // tolerance is 1e-4 of the generic value's magnitude
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_ssse3[i], fabs(output_generic[i])*1e-4);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h
new file mode 100644
index 000000000..ebb70b97a
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H
+#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { // CppUnit test case exposing a single test, t1
+
+ CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_16s_aligned16); // begin the suite declaration for this class
+ CPPUNIT_TEST (t1); // register t1 as the only test in the suite
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // body lives in qa_16sc_deinterleave_real_16s_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc
new file mode 100644
index 000000000..0f4ba6923
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc
@@ -0,0 +1,123 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_16sc_deinterleave_real_32f_aligned16.h>
+#include <volk/volk_16sc_deinterleave_real_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse and sse4.1
+
+#ifndef LV_HAVE_SSE4_1
+
+#ifndef LV_HAVE_SSE
+
+void qa_16sc_deinterleave_real_32f_aligned16::t1() { // Fallback built when neither LV_HAVE_SSE4_1 nor LV_HAVE_SSE is defined: prints a notice and asserts nothing.
+ printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_deinterleave_real_32f_aligned16::t1() { // Variant built with LV_HAVE_SSE but without LV_HAVE_SSE4_1: times the "generic" and "sse" calls and checks that their outputs agree.
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201; // number of complex input samples
+ const int ITERS = 100000; // calls per timed loop
+ std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_sse[vlen] __attribute__ ((aligned (16)));
+
+ int16_t* loadInput = (int16_t*)input0; // view the complex buffer as 2*vlen raw int16_t values
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0)); // rand() mapped to roughly [-1, 1], scaled by 32768, then converted to int16_t
+ }
+ printf("16sc_deinterleave_real_32f_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); // trailing string presumably selects the implementation - confirm against volk
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC; // elapsed clock() ticks converted to seconds
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); // tolerance is 1e-4 of the generic value's magnitude
+ }
+}
+
+#endif /* SSE */
+
+#else
+
+void qa_16sc_deinterleave_real_32f_aligned16::t1() { // Variant built with LV_HAVE_SSE4_1: times the "generic" and "sse" calls plus the runtime-dispatched entry point and checks that all outputs agree.
+
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201; // number of complex input samples
+ const int ITERS = 100000; // calls per timed loop
+ std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_sse[vlen] __attribute__ ((aligned (16)));
+ float output_sse4_1[vlen] __attribute__ ((aligned (16)));
+
+ int16_t* loadInput = (int16_t*)input0; // view the complex buffer as 2*vlen raw int16_t values
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0)); // scale BEFORE the int16_t cast; casting the [-1, 1] ratio first truncates nearly every sample to 0
+ }
+ printf("16sc_deinterleave_real_32f_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); // trailing string presumably selects the implementation - confirm against volk
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC; // elapsed clock() ticks converted to seconds
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_16sc_deinterleave_real_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); // NOTE(review): reported as sse4_1 on the assumption the runtime dispatches to that variant - confirm
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4_1_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); // tolerance is 1e-4 of the generic value's magnitude
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4);
+ }
+}
+
+#endif /* SSE4_1 */
diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h
new file mode 100644
index 000000000..e83426473
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { // CppUnit test case exposing a single test, t1
+
+ CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_32f_aligned16); // begin the suite declaration for this class
+ CPPUNIT_TEST (t1); // register t1 as the only test in the suite
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // body lives in qa_16sc_deinterleave_real_32f_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc
new file mode 100644
index 000000000..5ab458bc9
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16sc_deinterleave_real_8s_aligned16.h>
+#include <volk/volk_16sc_deinterleave_real_8s_aligned16.h>
+#include <cstdlib>
+
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16sc_deinterleave_real_8s_aligned16::t1() { // Fallback built when LV_HAVE_SSSE3 is undefined: prints a notice and asserts nothing.
+ printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_deinterleave_real_8s_aligned16::t1() { // Times the "generic" and "ssse3" calls of volk_16sc_deinterleave_real_8s_aligned16_manual and checks that their outputs are equal.
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201; // number of complex input samples
+ const int ITERS = 100000; // calls per timed loop
+ std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int8_t output_ssse3[vlen] __attribute__ ((aligned (16)));
+
+ int16_t* loadInput = (int16_t*)input0; // view the complex buffer as 2*vlen raw int16_t values
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0)); // scale BEFORE the int16_t cast; casting the [-1, 1] ratio first truncates nearly every sample to 0
+ }
+ printf("16sc_deinterleave_real_8s_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_real_8s_aligned16_manual(output_generic, input0, vlen, "generic"); // trailing string presumably selects the implementation - confirm against volk
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC; // elapsed clock() ticks converted to seconds
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("ssse3_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); // integer outputs must match exactly
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h
new file mode 100644
index 000000000..04e5511e5
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H
+#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_deinterleave_real_8s_aligned16 : public CppUnit::TestCase { // CppUnit test case exposing a single test, t1
+
+ CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_8s_aligned16); // begin the suite declaration for this class
+ CPPUNIT_TEST (t1); // register t1 as the only test in the suite
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // body lives in qa_16sc_deinterleave_real_8s_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
new file mode 100644
index 000000000..b14610757
--- /dev/null
+++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_16sc_magnitude_16s_aligned16.h>
+#include <volk/volk_16sc_magnitude_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse and sse3
+
+#ifndef LV_HAVE_SSE3
+
+void qa_16sc_magnitude_16s_aligned16::t1() { // Fallback built when LV_HAVE_SSE3 is undefined: prints a notice and asserts nothing.
+ printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_magnitude_16s_aligned16::t1() { // Times the "generic", "sse" and "sse3" calls of volk_16sc_magnitude_16s_aligned16_manual and checks that their outputs agree within 1.1.
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201; // number of complex input samples
+ const int ITERS = 100000; // calls per timed loop
+ std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse3[vlen] __attribute__ ((aligned (16)));
+
+ int16_t* loadInput = (int16_t*)input0; // view the complex buffer as 2*vlen raw int16_t values
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0)); // rand() mapped to roughly [-1, 1], scaled by 32768, then converted to int16_t
+ }
+ printf("16sc_magnitude_16s_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_magnitude_16s_aligned16_manual(output_generic, input0, vlen, "generic"); // trailing string presumably selects the implementation - confirm against volk
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC; // elapsed clock() ticks converted to seconds
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_magnitude_16s_aligned16_manual(output_sse, input0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_magnitude_16s_aligned16_manual(output_sse3, input0, vlen, "sse3");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse3_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1); // absolute tolerance of 1.1, i.e. the integer results may differ by one
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.h b/volk/lib/qa_16sc_magnitude_16s_aligned16.h
new file mode 100644
index 000000000..4664b70f4
--- /dev/null
+++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H
+#define INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_magnitude_16s_aligned16 : public CppUnit::TestCase { // CppUnit test case exposing a single test, t1
+
+ CPPUNIT_TEST_SUITE (qa_16sc_magnitude_16s_aligned16); // begin the suite declaration for this class
+ CPPUNIT_TEST (t1); // register t1 as the only test in the suite
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // body lives in qa_16sc_magnitude_16s_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
new file mode 100644
index 000000000..06dff2fd5
--- /dev/null
+++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_16sc_magnitude_32f_aligned16.h>
+#include <volk/volk_16sc_magnitude_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse and sse3
+
+#ifndef LV_HAVE_SSE3
+
+void qa_16sc_magnitude_32f_aligned16::t1() { // Fallback built when LV_HAVE_SSE3 is undefined: prints a notice and asserts nothing.
+ printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_magnitude_32f_aligned16::t1() { // Times the "generic", "sse" and "sse3" calls of volk_16sc_magnitude_32f_aligned16_manual and checks that their outputs agree.
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201; // number of complex input samples
+ const int ITERS = 100000; // calls per timed loop
+ std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_sse[vlen] __attribute__ ((aligned (16)));
+ float output_sse3[vlen] __attribute__ ((aligned (16)));
+
+ int16_t* inputLoad = (int16_t*)input0; // view the complex buffer as 2*vlen raw int16_t values
+ for(int i = 0; i < 2*vlen; ++i) {
+ inputLoad[i] = (int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0); // scale to the int16_t range before the cast; the unscaled [-1, 1] ratio truncates nearly every sample to 0
+ }
+ printf("16sc_magnitude_32f_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_magnitude_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); // trailing string presumably selects the implementation - confirm against volk
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC; // elapsed clock() ticks converted to seconds
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_magnitude_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16sc_magnitude_32f_aligned16_manual(output_sse3, input0, 32768.0, vlen, "sse3");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse3_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); // tolerance is 1e-4 of the generic value's magnitude
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.h b/volk/lib/qa_16sc_magnitude_32f_aligned16.h
new file mode 100644
index 000000000..0c25673ea
--- /dev/null
+++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H
+#define INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_magnitude_32f_aligned16 : public CppUnit::TestCase { // CppUnit test case exposing a single test, t1
+
+ CPPUNIT_TEST_SUITE (qa_16sc_magnitude_32f_aligned16); // begin the suite declaration for this class
+ CPPUNIT_TEST (t1); // register t1 as the only test in the suite
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // body lives in qa_16sc_magnitude_32f_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc
new file mode 100644
index 000000000..6b19828a4
--- /dev/null
+++ b/volk/lib/qa_16u_byteswap_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16u_byteswap_aligned16.h>
+#include <volk/volk_16u_byteswap_aligned16.h>
+#include <cstdlib>
+#include <cstring>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16u_byteswap_aligned16::t1() { // Fallback built when LV_HAVE_SSE2 is undefined: prints a notice and asserts nothing.
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16u_byteswap_aligned16::t1() { // Times the "generic" and "sse2" calls of volk_16u_byteswap_aligned16_manual on two copies of the same buffer and checks that the copies end up equal.
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201; // number of 16-bit words in each buffer
+ const int ITERS = 100001; // calls per timed loop; NOTE(review): presumably odd so repeated in-place swaps do not cancel out - confirm
+
+ uint16_t output0[vlen] __attribute__ ((aligned (16)));
+ uint16_t output01[vlen] __attribute__ ((aligned (16)));
+
+ for(int i = 0; i < vlen; ++i) {
+ output0[i] = (uint16_t) (rand() & 0xFFFF); // low 16 bits of rand(); the former integer division yielded almost only 0, which a byte swap leaves unchanged
+ }
+ memcpy(output01, output0, vlen*sizeof(uint16_t)); // both implementations start from identical data
+
+ printf("16u_byteswap_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16u_byteswap_aligned16_manual(output0, vlen, "generic"); // trailing string presumably selects the implementation - confirm against volk
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC; // elapsed clock() ticks converted to seconds
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16u_byteswap_aligned16_manual(output01, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); // integer results must match exactly
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16u_byteswap_aligned16.h b/volk/lib/qa_16u_byteswap_aligned16.h
new file mode 100644
index 000000000..e11b23e3f
--- /dev/null
+++ b/volk/lib/qa_16u_byteswap_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H
+#define INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16u_byteswap_aligned16 : public CppUnit::TestCase { // CppUnit test case exposing a single test, t1
+
+ CPPUNIT_TEST_SUITE (qa_16u_byteswap_aligned16); // begin the suite declaration for this class
+ CPPUNIT_TEST (t1); // register t1 as the only test in the suite
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // body lives in qa_16u_byteswap_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_accumulator_aligned16.cc b/volk/lib/qa_32f_accumulator_aligned16.cc
new file mode 100644
index 000000000..ea637d600
--- /dev/null
+++ b/volk/lib/qa_32f_accumulator_aligned16.cc
@@ -0,0 +1,56 @@
+#include <volk/volk.h>
+#include <qa_32f_accumulator_aligned16.h>
+#include <volk/volk_32f_accumulator_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_accumulator_aligned16::t1() { // Fallback built when LV_HAVE_SSE is undefined: prints a notice and asserts nothing.
+ printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_accumulator_aligned16::t1() { // Times the "generic" and "sse" calls of volk_32f_accumulator_aligned16_manual and checks that the two results agree.
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201; // number of input floats
+ const int ITERS = 100000; // calls per timed loop
+ float input0[vlen] __attribute__ ((aligned (16)));
+
+ float accumulator_generic; // NOTE(review): not initialised here; assumes the kernel overwrites rather than adds to it - confirm
+ float accumulator_sse; // NOTE(review): same assumption as accumulator_generic
+
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); // random floats in roughly [-1, 1]
+ }
+ printf("32f_accumulator_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_accumulator_aligned16_manual(&accumulator_generic, input0, vlen, "generic"); // trailing string presumably selects the implementation - confirm against volk
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC; // elapsed clock() ticks converted to seconds
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_accumulator_aligned16_manual(&accumulator_sse, input0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(accumulator_generic, accumulator_sse, fabs(accumulator_generic)*1e-4); // tolerance is 1e-4 of the generic result's magnitude
+}
+
+#endif
diff --git a/volk/lib/qa_32f_accumulator_aligned16.h b/volk/lib/qa_32f_accumulator_aligned16.h
new file mode 100644
index 000000000..0004d3ff0
--- /dev/null
+++ b/volk/lib/qa_32f_accumulator_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H
+#define INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_accumulator_aligned16 : public CppUnit::TestCase { // CppUnit test case exposing a single test, t1
+
+ CPPUNIT_TEST_SUITE (qa_32f_accumulator_aligned16); // begin the suite declaration for this class
+ CPPUNIT_TEST (t1); // register t1 as the only test in the suite
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // body lives in qa_32f_accumulator_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc
new file mode 100644
index 000000000..92f35c7ec
--- /dev/null
+++ b/volk/lib/qa_32f_add_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_add_aligned16.h>
+#include <volk/volk_32f_add_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_add_aligned16::t1() { // Fallback built when LV_HAVE_SSE is undefined: prints a notice and asserts nothing.
+ printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_add_aligned16::t1() { // Times the "generic" and "sse" calls of volk_32f_add_aligned16_manual and checks that their outputs are equal.
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201; // number of floats per input vector
+ const int ITERS = 100000; // calls per timed loop
+ float input0[vlen] __attribute__ ((aligned (16)));
+ float input1[vlen] __attribute__ ((aligned (16)));
+
+ float output0[vlen] __attribute__ ((aligned (16))); // receives the "generic" result
+ float output01[vlen] __attribute__ ((aligned (16))); // receives the "sse" result
+
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); // random floats in roughly [-1, 1]
+ input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+ }
+ printf("32f_add_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_add_aligned16_manual(output0, input0, input1, vlen, "generic"); // trailing string presumably selects the implementation - confirm against volk
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC; // elapsed clock() ticks converted to seconds
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_add_aligned16_manual(output01, input0, input1, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); // exact float equality is required between the two results
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_add_aligned16.h b/volk/lib/qa_32f_add_aligned16.h
new file mode 100644
index 000000000..58e2a151c
--- /dev/null
+++ b/volk/lib/qa_32f_add_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_ADD_ALIGNED16_H
+#define INCLUDED_QA_32F_ADD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_add_aligned16 : public CppUnit::TestCase { // CppUnit test case exposing a single test, t1
+
+ CPPUNIT_TEST_SUITE (qa_32f_add_aligned16); // begin the suite declaration for this class
+ CPPUNIT_TEST (t1); // register t1 as the only test in the suite
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // body lives in qa_32f_add_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32F_ADD_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc
new file mode 100644
index 000000000..3c8137004
--- /dev/null
+++ b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc
@@ -0,0 +1,59 @@
+#include <volk/volk.h>
+#include <qa_32f_calc_spectral_noise_floor_aligned16.h>
+#include <volk/volk_32f_calc_spectral_noise_floor_aligned16.h>
+#include <cstdlib>
+#include <math.h>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_calc_spectral_noise_floor_aligned16::t1() { // Fallback built when LV_HAVE_SSE is undefined: prints a notice and asserts nothing.
+ printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_calc_spectral_noise_floor_aligned16::t1() { // Times the "generic" and "sse" calls of volk_32f_calc_spectral_noise_floor_aligned16_manual and checks that the single-value results agree.
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201; // number of input floats
+ const int ITERS = 100000; // calls per timed loop
+ float input0[vlen] __attribute__ ((aligned (16)));
+
+ float output0[1] __attribute__ ((aligned (16))); // receives the "generic" result
+ float output01[1] __attribute__ ((aligned (16))); // receives the "sse" result
+
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); // random floats in roughly [-1, 1]
+ }
+ printf("32f_calc_spectral_noise_floor_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_calc_spectral_noise_floor_aligned16_manual(output0, input0, 20, vlen, "generic"); // NOTE(review): meaning of the constant 20 is not visible here - check the kernel header
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC; // elapsed clock() ticks converted to seconds
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_calc_spectral_noise_floor_aligned16_manual(output01, input0, 20, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < 1; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); // tolerance is 1e-4 of the generic result's magnitude
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h
new file mode 100644
index 000000000..c5dce2c4b
--- /dev/null
+++ b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
+#define INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_calc_spectral_noise_floor_aligned16 : public CppUnit::TestCase { // CppUnit test case exposing a single test, t1
+
+ CPPUNIT_TEST_SUITE (qa_32f_calc_spectral_noise_floor_aligned16); // begin the suite declaration for this class
+ CPPUNIT_TEST (t1); // register t1 as the only test in the suite
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // body lives in qa_32f_calc_spectral_noise_floor_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_16s_aligned16.cc b/volk/lib/qa_32f_convert_16s_aligned16.cc
new file mode 100644
index 000000000..84a4c40c4
--- /dev/null
+++ b/volk/lib/qa_32f_convert_16s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_16s_aligned16.h>
+#include <volk/volk_32f_convert_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_16s_aligned16::t1() { // Fallback built when LV_HAVE_SSE2 is undefined: prints a notice and asserts nothing.
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_convert_16s_aligned16::t1() { // Times the "generic", "sse" and "sse2" calls of volk_32f_convert_16s_aligned16_manual and checks that their outputs differ by at most 1.
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201; // number of input floats
+ const int ITERS = 100000; // calls per timed loop
+ float input0[vlen] __attribute__ ((aligned (16)));
+
+ int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); // random floats in roughly [-1, 1]
+ }
+ printf("32f_convert_16s_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_convert_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); // trailing string presumably selects the implementation - confirm against volk
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC; // elapsed clock() ticks converted to seconds
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_convert_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_convert_16s_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("%d generic... %d, sse... %d sse2... %d\n", i, output_generic[i], output_sse[i], output_sse2[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); // allow the converted integers to differ by one
+ CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_16s_aligned16.h b/volk/lib/qa_32f_convert_16s_aligned16.h
new file mode 100644
index 000000000..fce1eb417
--- /dev/null
+++ b/volk/lib/qa_32f_convert_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_16s_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture whose suite consists of the single test t1.
+ CPPUNIT_TEST_SUITE (qa_32f_convert_16s_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is defined in qa_32f_convert_16s_aligned16.cc.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.cc b/volk/lib/qa_32f_convert_16s_unaligned16.cc
new file mode 100644
index 000000000..9469daed2
--- /dev/null
+++ b/volk/lib/qa_32f_convert_16s_unaligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_16s_unaligned16.h>
+#include <volk/volk_32f_convert_16s_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_16s_unaligned16::t1() { // variant built when LV_HAVE_SSE2 is undefined
+  fputs("sse2 not available... no test performed\n", stdout);
+}
+
+#else
+
+void qa_32f_convert_16s_unaligned16::t1() {
+  // Times the "generic", "sse" and "sse2" variants, then checks sse/sse2 against
+  // generic (+/-1). NOTE(review): all buffers here are 16-byte aligned - intended?
+  volk_environment_init();
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  const float half_range = static_cast<float>((RAND_MAX/2));
+
+  float samples[vlen] __attribute__ ((aligned (16)));
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  // Fill the input with pseudo-random values in roughly [-1, 1].
+  for(int n = 0; n < vlen; ++n) {
+    samples[n] = static_cast<float>(rand() - (RAND_MAX/2)) / half_range;
+  }
+  printf("32f_convert_16s_unaligned\n");
+
+  // Time ITERS calls made with the "generic" implementation tag.
+  clock_t tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_16s_unaligned16_manual(output_generic, samples, 32768.0, vlen, "generic");
+  }
+  clock_t toc = clock();
+  double elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_16s_unaligned16_manual(output_sse, samples, 32768.0, vlen, "sse");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse2" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_16s_unaligned16_manual(output_sse2, samples, 32768.0, vlen, "sse2");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse2_time: %f\n", elapsed);
+
+  // Element-wise comparison against the generic output; a difference of one
+  // count is tolerated.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.h b/volk/lib/qa_32f_convert_16s_unaligned16.h
new file mode 100644
index 000000000..492bc80e6
--- /dev/null
+++ b/volk/lib/qa_32f_convert_16s_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_16s_unaligned16 : public CppUnit::TestCase {
+ // CppUnit fixture whose suite consists of the single test t1.
+ CPPUNIT_TEST_SUITE (qa_32f_convert_16s_unaligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is defined in qa_32f_convert_16s_unaligned16.cc.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_32s_aligned16.cc b/volk/lib/qa_32f_convert_32s_aligned16.cc
new file mode 100644
index 000000000..ff24c7b0d
--- /dev/null
+++ b/volk/lib/qa_32f_convert_32s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_32s_aligned16.h>
+#include <volk/volk_32f_convert_32s_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_32s_aligned16::t1() { // variant built when LV_HAVE_SSE2 is undefined
+  fputs("sse2 not available... no test performed\n", stdout);
+}
+
+#else
+
+void qa_32f_convert_32s_aligned16::t1() {
+  // Times the "generic", "sse" and "sse2" variants of the kernel on one random
+  // input vector, then checks the sse/sse2 outputs against generic (+/-1).
+  volk_environment_init();
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  const float half_range = static_cast<float>((RAND_MAX/2));
+
+  float samples[vlen] __attribute__ ((aligned (16)));
+  int32_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int32_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int32_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  // Fill the input with pseudo-random values in roughly [-1, 1].
+  for(int n = 0; n < vlen; ++n) {
+    samples[n] = static_cast<float>(rand() - (RAND_MAX/2)) / half_range;
+  }
+  printf("32f_convert_32s_aligned\n");
+
+  // Time ITERS calls made with the "generic" implementation tag.
+  clock_t tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_32s_aligned16_manual(output_generic, samples, 32768.0, vlen, "generic");
+  }
+  clock_t toc = clock();
+  double elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_32s_aligned16_manual(output_sse, samples, 32768.0, vlen, "sse");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse2" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_32s_aligned16_manual(output_sse2, samples, 32768.0, vlen, "sse2");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse2_time: %f\n", elapsed);
+
+  // Element-wise comparison against the generic output; a difference of one
+  // count is tolerated.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_32s_aligned16.h b/volk/lib/qa_32f_convert_32s_aligned16.h
new file mode 100644
index 000000000..97d854463
--- /dev/null
+++ b/volk/lib/qa_32f_convert_32s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_32s_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture whose suite consists of the single test t1.
+ CPPUNIT_TEST_SUITE (qa_32f_convert_32s_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is defined in qa_32f_convert_32s_aligned16.cc.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.cc b/volk/lib/qa_32f_convert_32s_unaligned16.cc
new file mode 100644
index 000000000..e63b17994
--- /dev/null
+++ b/volk/lib/qa_32f_convert_32s_unaligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_32s_unaligned16.h>
+#include <volk/volk_32f_convert_32s_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_32s_unaligned16::t1() { // variant built when LV_HAVE_SSE2 is undefined
+  fputs("sse2 not available... no test performed\n", stdout);
+}
+
+#else
+
+void qa_32f_convert_32s_unaligned16::t1() {
+  // Times the "generic", "sse" and "sse2" variants, then checks sse/sse2 against
+  // generic (+/-1). NOTE(review): all buffers here are 16-byte aligned - intended?
+  volk_environment_init();
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  const float half_range = static_cast<float>((RAND_MAX/2));
+
+  float samples[vlen] __attribute__ ((aligned (16)));
+  int32_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int32_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int32_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  // Fill the input with pseudo-random values in roughly [-1, 1].
+  for(int n = 0; n < vlen; ++n) {
+    samples[n] = static_cast<float>(rand() - (RAND_MAX/2)) / half_range;
+  }
+  printf("32f_convert_32s_unaligned\n");
+
+  // Time ITERS calls made with the "generic" implementation tag.
+  clock_t tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_32s_unaligned16_manual(output_generic, samples, 32768.0, vlen, "generic");
+  }
+  clock_t toc = clock();
+  double elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_32s_unaligned16_manual(output_sse, samples, 32768.0, vlen, "sse");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse2" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_32s_unaligned16_manual(output_sse2, samples, 32768.0, vlen, "sse2");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse2_time: %f\n", elapsed);
+
+  // Element-wise comparison against the generic output; a difference of one
+  // count is tolerated.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.h b/volk/lib/qa_32f_convert_32s_unaligned16.h
new file mode 100644
index 000000000..5d662d86d
--- /dev/null
+++ b/volk/lib/qa_32f_convert_32s_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_32s_unaligned16 : public CppUnit::TestCase {
+ // CppUnit fixture whose suite consists of the single test t1.
+ CPPUNIT_TEST_SUITE (qa_32f_convert_32s_unaligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is defined in qa_32f_convert_32s_unaligned16.cc.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_64f_aligned16.cc b/volk/lib/qa_32f_convert_64f_aligned16.cc
new file mode 100644
index 000000000..c546e47de
--- /dev/null
+++ b/volk/lib/qa_32f_convert_64f_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_64f_aligned16.h>
+#include <volk/volk_32f_convert_64f_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_64f_aligned16::t1() { // variant built when LV_HAVE_SSE2 is undefined
+  fputs("sse2 not available... no test performed\n", stdout);
+}
+
+#else
+
+void qa_32f_convert_64f_aligned16::t1() {
+  // Times the "generic" and "sse2" variants of the kernel on one random input
+  // vector, then checks the sse2 output against the generic output.
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  const float half_range = static_cast<float>((RAND_MAX/2));
+
+  float samples[vlen] __attribute__ ((aligned (16)));
+  double output_generic[vlen] __attribute__ ((aligned (16)));
+  double output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  // Fill the input with pseudo-random values in roughly [-1, 1].
+  for(int n = 0; n < vlen; ++n) {
+    samples[n] = static_cast<float>(rand() - (RAND_MAX/2)) / half_range;
+  }
+  printf("32f_convert_64f_aligned\n");
+
+  // Time ITERS calls made with the "generic" implementation tag.
+  clock_t tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_64f_aligned16_manual(output_generic, samples, vlen, "generic");
+  }
+  clock_t toc = clock();
+  double elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse2" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_64f_aligned16_manual(output_sse2, samples, vlen, "sse2");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse2_time: %f\n", elapsed);
+
+  // Element-wise comparison with a tolerance of 1e-6 relative to the
+  // magnitude of the generic value.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i] ,output_sse2[i], fabs(output_generic[i])*1e-6);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_64f_aligned16.h b/volk/lib/qa_32f_convert_64f_aligned16.h
new file mode 100644
index 000000000..41eb3e094
--- /dev/null
+++ b/volk/lib/qa_32f_convert_64f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_64f_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture whose suite consists of the single test t1.
+ CPPUNIT_TEST_SUITE (qa_32f_convert_64f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is defined in qa_32f_convert_64f_aligned16.cc.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.cc b/volk/lib/qa_32f_convert_64f_unaligned16.cc
new file mode 100644
index 000000000..24b51f9af
--- /dev/null
+++ b/volk/lib/qa_32f_convert_64f_unaligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_64f_unaligned16.h>
+#include <volk/volk_32f_convert_64f_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_64f_unaligned16::t1() { // variant built when LV_HAVE_SSE2 is undefined
+  fputs("sse2 not available... no test performed\n", stdout);
+}
+
+#else
+
+void qa_32f_convert_64f_unaligned16::t1() {
+  // Times the "generic" and "sse2" variants, then requires identical outputs.
+  // NOTE(review): all buffers here are 16-byte aligned - intended?
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  const float half_range = static_cast<float>((RAND_MAX/2));
+
+  float samples[vlen] __attribute__ ((aligned (16)));
+  double output_generic[vlen] __attribute__ ((aligned (16)));
+  double output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  // Fill the input with pseudo-random values in roughly [-1, 1].
+  for(int n = 0; n < vlen; ++n) {
+    samples[n] = static_cast<float>(rand() - (RAND_MAX/2)) / half_range;
+  }
+  printf("32f_convert_64f_unaligned\n");
+
+  // Time ITERS calls made with the "generic" implementation tag.
+  clock_t tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_64f_unaligned16_manual(output_generic, samples, vlen, "generic");
+  }
+  clock_t toc = clock();
+  double elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse2" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_64f_unaligned16_manual(output_sse2, samples, vlen, "sse2");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse2_time: %f\n", elapsed);
+
+  // Element-wise comparison; the two outputs must be exactly equal, no
+  // tolerance is applied here.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.h b/volk/lib/qa_32f_convert_64f_unaligned16.h
new file mode 100644
index 000000000..4b144f033
--- /dev/null
+++ b/volk/lib/qa_32f_convert_64f_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_64f_unaligned16 : public CppUnit::TestCase {
+ // CppUnit fixture whose suite consists of the single test t1.
+ CPPUNIT_TEST_SUITE (qa_32f_convert_64f_unaligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is defined in qa_32f_convert_64f_unaligned16.cc.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_8s_aligned16.cc b/volk/lib/qa_32f_convert_8s_aligned16.cc
new file mode 100644
index 000000000..a3d4d6567
--- /dev/null
+++ b/volk/lib/qa_32f_convert_8s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_8s_aligned16.h>
+#include <volk/volk_32f_convert_8s_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_8s_aligned16::t1() { // variant built when LV_HAVE_SSE2 is undefined
+  fputs("sse2 not available... no test performed\n", stdout);
+}
+
+#else
+
+void qa_32f_convert_8s_aligned16::t1() {
+  // Times the "generic", "sse" and "sse2" variants of the kernel on one random
+  // input vector, then checks the sse/sse2 outputs against generic (+/-1).
+  volk_environment_init();
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  const float half_range = static_cast<float>((RAND_MAX/2));
+
+  float samples[vlen] __attribute__ ((aligned (16)));
+  int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int8_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int8_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  // Fill the input with pseudo-random values in roughly [-1, 1].
+  for(int n = 0; n < vlen; ++n) {
+    samples[n] = static_cast<float>(rand() - (RAND_MAX/2)) / half_range;
+  }
+  printf("32f_convert_8s_aligned\n");
+
+  // Time ITERS calls made with the "generic" implementation tag.
+  clock_t tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_8s_aligned16_manual(output_generic, samples, 128.0, vlen, "generic");
+  }
+  clock_t toc = clock();
+  double elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_8s_aligned16_manual(output_sse, samples, 128.0, vlen, "sse");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse2" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_8s_aligned16_manual(output_sse2, samples, 128.0, vlen, "sse2");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse2_time: %f\n", elapsed);
+
+  // Element-wise comparison against the generic output; a difference of one
+  // count is tolerated.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_8s_aligned16.h b/volk/lib/qa_32f_convert_8s_aligned16.h
new file mode 100644
index 000000000..68a523f34
--- /dev/null
+++ b/volk/lib/qa_32f_convert_8s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_8s_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture whose suite consists of the single test t1.
+ CPPUNIT_TEST_SUITE (qa_32f_convert_8s_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is defined in qa_32f_convert_8s_aligned16.cc.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.cc b/volk/lib/qa_32f_convert_8s_unaligned16.cc
new file mode 100644
index 000000000..d885fd6bb
--- /dev/null
+++ b/volk/lib/qa_32f_convert_8s_unaligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_8s_unaligned16.h>
+#include <volk/volk_32f_convert_8s_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_8s_unaligned16::t1() { // variant built when LV_HAVE_SSE2 is undefined
+  fputs("sse2 not available... no test performed\n", stdout);
+}
+
+#else
+
+void qa_32f_convert_8s_unaligned16::t1() {
+  // Times the "generic", "sse" and "sse2" variants, then checks sse/sse2 against
+  // generic (+/-1). NOTE(review): all buffers here are 16-byte aligned - intended?
+  volk_environment_init();
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  const float half_range = static_cast<float>((RAND_MAX/2));
+
+  float samples[vlen] __attribute__ ((aligned (16)));
+  int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int8_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int8_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  // Fill the input with pseudo-random values in roughly [-1, 1].
+  for(int n = 0; n < vlen; ++n) {
+    samples[n] = static_cast<float>(rand() - (RAND_MAX/2)) / half_range;
+  }
+  printf("32f_convert_8s_unaligned\n");
+
+  // Time ITERS calls made with the "generic" implementation tag.
+  clock_t tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_8s_unaligned16_manual(output_generic, samples, 128.0, vlen, "generic");
+  }
+  clock_t toc = clock();
+  double elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_8s_unaligned16_manual(output_sse, samples, 128.0, vlen, "sse");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse2" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_convert_8s_unaligned16_manual(output_sse2, samples, 128.0, vlen, "sse2");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse2_time: %f\n", elapsed);
+
+  // Element-wise comparison against the generic output; a difference of one
+  // count is tolerated.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.h b/volk/lib/qa_32f_convert_8s_unaligned16.h
new file mode 100644
index 000000000..88d4ff42a
--- /dev/null
+++ b/volk/lib/qa_32f_convert_8s_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_8s_unaligned16 : public CppUnit::TestCase {
+ // CppUnit fixture whose suite consists of the single test t1.
+ CPPUNIT_TEST_SUITE (qa_32f_convert_8s_unaligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is defined in qa_32f_convert_8s_unaligned16.cc.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H */
diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc
new file mode 100644
index 000000000..b20999beb
--- /dev/null
+++ b/volk/lib/qa_32f_divide_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_divide_aligned16.h>
+#include <volk/volk_32f_divide_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_divide_aligned16::t1() { // variant built when LV_HAVE_SSE is undefined
+  fputs("sse not available... no test performed\n", stdout);
+}
+
+#else
+
+void qa_32f_divide_aligned16::t1() {
+  // Times the "generic" and "sse" variants of the kernel on the same pair of
+  // random input vectors, then requires both outputs to be exactly equal.
+  volk_environment_init();
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  const float half_range = static_cast<float>((RAND_MAX/2));
+
+  float operand_a[vlen] __attribute__ ((aligned (16)));
+  float operand_b[vlen] __attribute__ ((aligned (16)));
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  // Fill both inputs with pseudo-random values in roughly [-1, 1].
+  for(int n = 0; n < vlen; ++n) {
+    operand_a[n] = static_cast<float>(rand() - (RAND_MAX/2)) / half_range;
+    operand_b[n] = static_cast<float>(rand() - (RAND_MAX/2)) / half_range;
+  }
+  printf("32f_divide_aligned\n");
+
+  // Time ITERS calls made with the "generic" implementation tag.
+  clock_t tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_divide_aligned16_manual(output0, operand_a, operand_b, vlen, "generic");
+  }
+  clock_t toc = clock();
+  double elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time ITERS calls made with the "sse" implementation tag.
+  tic = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_32f_divide_aligned16_manual(output01, operand_a, operand_b, vlen, "sse");
+  }
+  toc = clock();
+  elapsed = static_cast<double>(toc - tic) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse_time: %f\n", elapsed);
+
+  // Element-wise comparison; no tolerance is applied.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_divide_aligned16.h b/volk/lib/qa_32f_divide_aligned16.h
new file mode 100644
index 000000000..79d5ae4b8
--- /dev/null
+++ b/volk/lib/qa_32f_divide_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_DIVIDE_ALIGNED16_H
+#define INCLUDED_QA_32F_DIVIDE_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_divide_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture whose suite consists of the single test t1.
+ CPPUNIT_TEST_SUITE (qa_32f_divide_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is defined in qa_32f_divide_aligned16.cc.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_DIVIDE_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_dot_prod_aligned16.cc b/volk/lib/qa_32f_dot_prod_aligned16.cc
new file mode 100644
index 000000000..98c1f2d99
--- /dev/null
+++ b/volk/lib/qa_32f_dot_prod_aligned16.cc
@@ -0,0 +1,183 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_dot_prod_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+
+//test for sse
+static float uniform() { // maps rand() from [0, RAND_MAX] onto [-1, 1], endpoints included
+  return (static_cast<float>(rand()) / RAND_MAX - 0.5) * 2.0;
+}
+
+// Fill buf[0..n) with successive values returned by uniform().
+static void random_floats (float *buf, unsigned n)
+{
+  float * const stop = buf + n;
+  while (buf != stop) *buf++ = uniform ();
+}
+
+#ifndef LV_HAVE_SSE4_1
+
+#ifdef LV_HAVE_SSE3
+void qa_32f_dot_prod_aligned16::t1() {
+  // Times the "generic", "sse" and "sse3" variants of the kernel on the same
+  // random vectors; sse/sse3 must match generic to within ERR_DELTA (relative).
+  const int vlen = 2046;
+  const int ITER = 100000;
+  int i;
+  clock_t start, end;
+  double total;
+  volk_environment_init();
+  // Start from NULL so a partial allocation failure can be cleaned up below.
+  float * input = NULL;
+  float * taps = NULL;
+  float * result_generic = NULL;
+  float * result_sse = NULL;
+  float * result_sse3 = NULL;
+
+  // posix_memalign() returns non-zero on failure; never run on bad buffers.
+  int ret = posix_memalign((void**)&input, 16, vlen*sizeof(float));
+  ret |= posix_memalign((void**)&taps, 16, vlen*sizeof(float));
+  ret |= posix_memalign((void**)&result_generic, 16, ITER*sizeof(float));
+  ret |= posix_memalign((void**)&result_sse, 16, ITER*sizeof(float));
+  ret |= posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float));
+  if(ret != 0) {
+    free(input); free(taps); free(result_generic); free(result_sse); free(result_sse3);
+    CPPUNIT_FAIL("posix_memalign failed");
+  }
+
+  random_floats(input, vlen);
+  random_floats(taps, vlen);
+  printf("32f_dot_prod_aligned16\n");
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_aligned16_manual(&result_generic[i], input, taps, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_aligned16_manual(&result_sse[i], input, taps, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_aligned16_manual(&result_sse3[i], input, taps, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  printf("generic: %f ... sse: %f ... sse3 %f \n", result_generic[0], result_sse[0], result_sse3[0]);
+  for(i = 0; i < ITER; i++){
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA);
+  }
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse);
+  free(result_sse3);
+}
+#else
+void qa_32f_dot_prod_aligned16::t1() { // variant built when neither LV_HAVE_SSE4_1 nor LV_HAVE_SSE3 is defined
+  fputs("sse3 not available... no test performed\n", stdout);
+}
+
+#endif /* LV_HAVE_SSE3 */
+
+#else
+
+void qa_32f_dot_prod_aligned16::t1() {
+  // Times the "generic", "sse", "sse3" and runtime-dispatched variants on the
+  // same random vectors; all must match generic within ERR_DELTA (relative).
+  volk_runtime_init();
+  volk_environment_init();
+  const int vlen = 4095;
+  const int ITER = 100000;
+  int i;
+  clock_t start, end;
+  double total;
+
+  // Start from NULL so a partial allocation failure can be cleaned up below.
+  float * input = NULL;
+  float * taps = NULL;
+  float * result_generic = NULL;
+  float * result_sse = NULL;
+  float * result_sse3 = NULL;
+  float * result_sse4_1 = NULL;
+
+  // posix_memalign() returns non-zero on failure; never run on bad buffers.
+  int ret = posix_memalign((void**)&input, 16, vlen * sizeof(float));
+  ret |= posix_memalign((void**)&taps, 16, vlen * sizeof(float));
+  ret |= posix_memalign((void**)&result_generic, 16, ITER*sizeof(float));
+  ret |= posix_memalign((void**)&result_sse, 16, ITER*sizeof(float));
+  ret |= posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float));
+  ret |= posix_memalign((void**)&result_sse4_1, 16, ITER*sizeof(float));
+  if(ret != 0) {
+    free(input); free(taps); free(result_generic);
+    free(result_sse); free(result_sse3); free(result_sse4_1);
+    CPPUNIT_FAIL("posix_memalign failed");
+  }
+
+  random_floats(input, vlen);
+  random_floats(taps, vlen);
+  printf("32f_dot_prod_aligned16\n");
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_aligned16_manual(&result_generic[i], input, taps, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_aligned16_manual(&result_sse[i], input, taps, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_aligned16_manual(&result_sse3[i], input, taps, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  // This run goes through the runtime table; it is reported as sse4_1.
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    get_volk_runtime()->volk_32f_dot_prod_aligned16(&result_sse4_1[i], input, taps, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(i =0; i < ITER; i++){
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse4_1[i], fabs(result_generic[i])*ERR_DELTA);
+  }
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse);
+  free(result_sse3);
+  free(result_sse4_1);
+}
+
+#endif /* LV_HAVE_SSE4_1 */
diff --git a/volk/lib/qa_32f_dot_prod_aligned16.h b/volk/lib/qa_32f_dot_prod_aligned16.h
new file mode 100644
index 000000000..6931a9e98
--- /dev/null
+++ b/volk/lib/qa_32f_dot_prod_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H
+#define INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_dot_prod_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture whose suite consists of the single test t1.
+ CPPUNIT_TEST_SUITE (qa_32f_dot_prod_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is defined in qa_32f_dot_prod_aligned16.cc.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_dot_prod_unaligned16.cc b/volk/lib/qa_32f_dot_prod_unaligned16.cc
new file mode 100644
index 000000000..8e97d4249
--- /dev/null
+++ b/volk/lib/qa_32f_dot_prod_unaligned16.cc
@@ -0,0 +1,190 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_dot_prod_unaligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+
+//test for sse
+static float uniform() { // maps rand() from [0, RAND_MAX] onto [-1, 1], endpoints included
+  return (static_cast<float>(rand()) / RAND_MAX - 0.5) * 2.0;
+}
+
+// Fill buf[0..n) with successive values returned by uniform().
+static void random_floats (float *buf, unsigned n)
+{
+  float * const stop = buf + n;
+  while (buf != stop) *buf++ = uniform ();
+}
+
+#ifndef LV_HAVE_SSE4_1
+
+#ifdef LV_HAVE_SSE3
+void qa_32f_dot_prod_unaligned16::t1() {
+  // Times the "generic", "sse" and "sse3" variants on the same misaligned
+  // random vectors; sse/sse3 must match generic within ERR_DELTA (relative).
+  volk_runtime_init();
+  volk_environment_init();
+  const int vlen = 2046;
+  const int ITER = 100000;
+  int i;
+  clock_t start, end;
+  double total;
+
+  // Start from NULL so a partial allocation failure can be cleaned up below.
+  float * input_base = NULL;
+  float * taps_base = NULL;
+  float * result_generic = NULL;
+  float * result_sse = NULL;
+  float * result_sse3 = NULL;
+
+  // One spare float per input so the working pointers can be shifted.
+  int ret = posix_memalign((void**)&input_base, 16, (vlen+1)*sizeof(float));
+  ret |= posix_memalign((void**)&taps_base, 16, (vlen+1)*sizeof(float));
+  ret |= posix_memalign((void**)&result_generic, 16, ITER*sizeof(float));
+  ret |= posix_memalign((void**)&result_sse, 16, ITER*sizeof(float));
+  ret |= posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float));
+  if(ret != 0) {
+    free(input_base); free(taps_base); free(result_generic); free(result_sse); free(result_sse3);
+    CPPUNIT_FAIL("posix_memalign failed");
+  }
+
+  // Skip one float so the inputs are no longer 16-byte aligned.
+  float * input = &input_base[1];
+  float * taps = &taps_base[1];
+  random_floats(input, vlen);
+  random_floats(taps, vlen);
+  printf("32f_dot_prod_unaligned16\n");
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_unaligned16_manual(&result_generic[i], input, taps, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_unaligned16_manual(&result_sse[i], input, taps, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_unaligned16_manual(&result_sse3[i], input, taps, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  printf("generic: %f ... sse: %f ... sse3 %f \n", result_generic[0], result_sse[0], result_sse3[0]);
+  for(i = 0; i < ITER; i++){
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA);
+  }
+  free(input_base);
+  free(taps_base);
+  free(result_generic);
+  free(result_sse);
+  free(result_sse3);
+}
+#else
+void qa_32f_dot_prod_unaligned16::t1() { // variant built when neither LV_HAVE_SSE4_1 nor LV_HAVE_SSE3 is defined
+  fputs("sse3 not available... no test performed\n", stdout);
+}
+
+#endif /* LV_HAVE_SSE3 */
+
+#else
+
+void qa_32f_dot_prod_unaligned16::t1() {
+  // Times the "generic", "sse", "sse3" and runtime-dispatched variants on the
+  // same misaligned random vectors; all must match generic within ERR_DELTA.
+  volk_runtime_init();
+  volk_environment_init();
+  const int vlen = 4095;
+  const int ITER = 100000;
+  int i;
+  clock_t start, end;
+  double total;
+
+  // Start from NULL so a partial allocation failure can be cleaned up below.
+  float * input_base = NULL;
+  float * taps_base = NULL;
+  float * result_generic = NULL;
+  float * result_sse = NULL;
+  float * result_sse3 = NULL;
+  float * result_sse4_1 = NULL;
+
+  // One spare float per input so the working pointers can be shifted.
+  int ret = posix_memalign((void**)&input_base, 16, (vlen+1) * sizeof(float));
+  ret |= posix_memalign((void**)&taps_base, 16, (vlen+1) * sizeof(float));
+  ret |= posix_memalign((void**)&result_generic, 16, ITER*sizeof(float));
+  ret |= posix_memalign((void**)&result_sse, 16, ITER*sizeof(float));
+  ret |= posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float));
+  ret |= posix_memalign((void**)&result_sse4_1, 16, ITER*sizeof(float));
+  if(ret != 0) {
+    free(input_base); free(taps_base); free(result_generic);
+    free(result_sse); free(result_sse3); free(result_sse4_1);
+    CPPUNIT_FAIL("posix_memalign failed");
+  }
+
+  // Skip one float so the inputs are no longer 16-byte aligned.
+  float * input = &input_base[1];
+  float * taps = &taps_base[1];
+  random_floats(input, vlen);
+  random_floats(taps, vlen);
+  printf("32f_dot_prod_unaligned16\n");
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_unaligned16_manual(&result_generic[i], input, taps, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_unaligned16_manual(&result_sse[i], input, taps, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_unaligned16_manual(&result_sse3[i], input, taps, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  // This run goes through the runtime table; it is reported as sse4_1.
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    get_volk_runtime()->volk_32f_dot_prod_unaligned16(&result_sse4_1[i], input, taps, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(i =0; i < ITER; i++){
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse4_1[i], fabs(result_generic[i])*ERR_DELTA);
+  }
+  free(input_base);
+  free(taps_base);
+  free(result_generic);
+  free(result_sse);
+  free(result_sse3);
+  free(result_sse4_1);
+}
+
+#endif /* LV_HAVE_SSE4_1 */
diff --git a/volk/lib/qa_32f_dot_prod_unaligned16.h b/volk/lib/qa_32f_dot_prod_unaligned16.h
new file mode 100644
index 000000000..e8bad07fe
--- /dev/null
+++ b/volk/lib/qa_32f_dot_prod_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H
+#define INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_dot_prod_unaligned16 QA test.
+// The suite macros below register t1 as the only test case.
+class qa_32f_dot_prod_unaligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_dot_prod_unaligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; this header only declares it.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H */
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc
new file mode 100644
index 000000000..ca65add28
--- /dev/null
+++ b/volk/lib/qa_32f_fm_detect_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_fm_detect_aligned16.h>
+#include <volk/volk_32f_fm_detect_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifdef LV_HAVE_SSE
+
+// Converts a pair of clock() samples into elapsed seconds.
+static double seconds_between(clock_t from, clock_t to) {
+  return (double)(to-from)/(double)CLOCKS_PER_SEC;
+}
+
+// Feeds the same random input to the "generic" and "sse" variants of
+// volk_32f_fm_detect_aligned16, prints the clock() time of each batch of
+// ITERS calls, and requires both outputs to agree within a relative 1e-4.
+void qa_32f_fm_detect_aligned16::t1() {
+
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 10000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  // rand() is centred on RAND_MAX/2 and divided by RAND_MAX/2.
+  const float half_range = static_cast<float>((RAND_MAX/2));
+  for(int n = 0; n != vlen; ++n) {
+    input0[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+  }
+  printf("32f_fm_detect_aligned\n");
+
+  // Reference batch; save is handed to the kernel by pointer on every call.
+  clock_t began = clock();
+  float save = 0.1;
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_fm_detect_aligned16_manual(output0, input0, 1.0, &save, vlen, "generic");
+  }
+  printf("generic_time: %f\n", seconds_between(began, clock()));
+
+  // SSE batch, restarted from the same initial save value.
+  began = clock();
+  save = 0.1;
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_fm_detect_aligned16_manual(output01, input0, 1.0, &save, vlen, "sse");
+  }
+  printf("sse_time: %f\n", seconds_between(began, clock()));
+
+  for(int n = 0; n != vlen; ++n) {
+    const double tolerance = fabs(output0[n]) * 1e-4;
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[n], output01[n], tolerance);
+  }
+}
+
+#else
+
+// Built when LV_HAVE_SSE is undefined: nothing to compare, just say so.
+void qa_32f_fm_detect_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.h b/volk/lib/qa_32f_fm_detect_aligned16.h
new file mode 100644
index 000000000..a2680c524
--- /dev/null
+++ b/volk/lib/qa_32f_fm_detect_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
+#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_fm_detect_aligned16 QA test.
+// The suite macros below register t1 as the only test case.
+class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; this header only declares it.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc
new file mode 100644
index 000000000..a1c3d4cd1
--- /dev/null
+++ b/volk/lib/qa_32f_index_max_aligned16.cc
@@ -0,0 +1,103 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_index_max_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 1000000
+#define VEC_LEN 3097
+// Maps one rand() draw onto the closed interval [-1, 1].
+static float uniform() {
+  return static_cast<float>(2.0 * ((float) rand() / RAND_MAX - 0.5));
+}
+
+// Fills buf[0..n) in ascending order with uniform() samples scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  float *const stop = buf + n;
+  for (float *cursor = buf; cursor != stop; ++cursor) {
+    *cursor = uniform () * 32767;
+  }
+}
+
+
+#ifndef LV_HAVE_SSE
+
+// Built when LV_HAVE_SSE is undefined: nothing to compare, just say so.
+void qa_32f_index_max_aligned16::t1(){
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+// Runs volk_32f_index_max_aligned16 on one random VEC_LEN-element vector
+// through the "generic" variant, the "sse2" variant and the
+// runtime-dispatched entry point (NUM_ITERS calls each, timed with clock()),
+// then requires all three to report the same index.
+void qa_32f_index_max_aligned16::t1(){
+
+  const int vlen = VEC_LEN;
+
+  volk_runtime_init();
+  volk_environment_init();
+
+  // Result slots, zeroed so the final comparison never reads garbage.
+  unsigned int target_generic = 0;
+  unsigned int target_sse = 0;
+  unsigned int target_sse4_1 = 0;
+
+  // Abort before touching src0 if the aligned allocation failed.
+  float* src0 = NULL;
+  const int ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float));
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+
+  random_floats(src0, vlen);
+
+  printf("32f_index_max_aligned16\n");
+
+  clock_t start, end;
+  double total;
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32f_index_max_aligned16_manual(&target_generic, src0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  // NOTE(review): the variant requested is "sse2" while the label printed
+  // below says "sse" and the guard is LV_HAVE_SSE; confirm which is intended.
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32f_index_max_aligned16_manual(&target_sse, src0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse time: %f\n", total);
+
+  // Runtime-dispatched implementation, reported under the sse4.1 label.
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    get_volk_runtime()->volk_32f_index_max_aligned16(&target_sse4_1, src0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.1 time: %f\n", total);
+
+  printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic, target_sse, target_sse4_1);
+
+  // Release the input first: a failing assertion throws and would otherwise
+  // skip the free().
+  free(src0);
+
+  CPPUNIT_ASSERT_EQUAL(target_generic, target_sse);
+  CPPUNIT_ASSERT_EQUAL(target_generic, target_sse4_1);
+}
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32f_index_max_aligned16.h b/volk/lib/qa_32f_index_max_aligned16.h
new file mode 100644
index 000000000..8cadffa47
--- /dev/null
+++ b/volk/lib/qa_32f_index_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_index_max_aligned16 QA test.
+// The suite macros below register t1 as the only test case.
+class qa_32f_index_max_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; this header only declares it.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.cc b/volk/lib/qa_32f_interleave_16sc_aligned16.cc
new file mode 100644
index 000000000..2a937637f
--- /dev/null
+++ b/volk/lib/qa_32f_interleave_16sc_aligned16.cc
@@ -0,0 +1,75 @@
+#include <volk/volk.h>
+#include <qa_32f_interleave_16sc_aligned16.h>
+#include <volk/volk_32f_interleave_16sc_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifdef LV_HAVE_SSE2
+
+// Converts a pair of clock() samples into elapsed seconds.
+static double seconds_between(clock_t from, clock_t to) {
+  return (double)(to-from)/(double)CLOCKS_PER_SEC;
+}
+
+// Runs volk_32f_interleave_16sc_aligned16 through its "generic", "sse" and
+// "sse2" variants on the same two random inputs (scalar 32768.0), prints the
+// clock() time of each batch of ITERS calls, and requires the real and
+// imaginary parts of every output to match the generic result within 1.01.
+void qa_32f_interleave_16sc_aligned16::t1() {
+
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+
+  std::complex<int16_t> output_generic[vlen] __attribute__ ((aligned (16)));
+  std::complex<int16_t> output_sse[vlen] __attribute__ ((aligned (16)));
+  std::complex<int16_t> output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  // rand() is centred on RAND_MAX/2 and divided by RAND_MAX/2.
+  const float half_range = static_cast<float>((RAND_MAX/2));
+  for(int n = 0; n != vlen; ++n) {
+    input0[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+    input1[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+  }
+  printf("32f_interleave_16sc_aligned\n");
+
+  clock_t began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_interleave_16sc_aligned16_manual(output_generic, input0, input1, 32768.0, vlen, "generic");
+  }
+  printf("generic_time: %f\n", seconds_between(began, clock()));
+
+  began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_interleave_16sc_aligned16_manual(output_sse, input0, input1, 32768.0, vlen, "sse");
+  }
+  printf("sse_time: %f\n", seconds_between(began, clock()));
+
+  began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_interleave_16sc_aligned16_manual(output_sse2, input0, input1, 32768.0, vlen, "sse2");
+  }
+  printf("sse2_time: %f\n", seconds_between(began, clock()));
+
+  for(int n = 0; n != vlen; ++n) {
+    const std::complex<int16_t>& expected = output_generic[n];
+
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(expected), std::real(output_sse[n]), 1.01);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(expected), std::imag(output_sse[n]), 1.01);
+
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(expected), std::real(output_sse2[n]), 1.01);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(expected), std::imag(output_sse2[n]), 1.01);
+  }
+}
+
+#else
+
+// Built when LV_HAVE_SSE2 is undefined: nothing to compare, just say so.
+void qa_32f_interleave_16sc_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#endif
diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.h b/volk/lib/qa_32f_interleave_16sc_aligned16.h
new file mode 100644
index 000000000..8d2914817
--- /dev/null
+++ b/volk/lib/qa_32f_interleave_16sc_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H
+#define INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_interleave_16sc_aligned16 QA test.
+// The suite macros below register t1 as the only test case.
+class qa_32f_interleave_16sc_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_interleave_16sc_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; this header only declares it.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.cc b/volk/lib/qa_32f_interleave_32fc_aligned16.cc
new file mode 100644
index 000000000..c22dd1046
--- /dev/null
+++ b/volk/lib/qa_32f_interleave_32fc_aligned16.cc
@@ -0,0 +1,62 @@
+#include <volk/volk.h>
+#include <qa_32f_interleave_32fc_aligned16.h>
+#include <volk/volk_32f_interleave_32fc_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifdef LV_HAVE_SSE
+
+// Converts a pair of clock() samples into elapsed seconds.
+static double seconds_between(clock_t from, clock_t to) {
+  return (double)(to-from)/(double)CLOCKS_PER_SEC;
+}
+
+// Runs volk_32f_interleave_32fc_aligned16 through its "generic" and "sse"
+// variants on the same two random inputs, prints the clock() time of each
+// batch of ITERS calls, and requires the real and imaginary parts of both
+// outputs to agree within a relative 1e-4.
+void qa_32f_interleave_32fc_aligned16::t1() {
+
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+
+  std::complex<float> output_generic[vlen] __attribute__ ((aligned (16)));
+  std::complex<float> output_sse[vlen] __attribute__ ((aligned (16)));
+
+  // rand() is centred on RAND_MAX/2 and divided by RAND_MAX/2.
+  const float half_range = static_cast<float>((RAND_MAX/2));
+  for(int n = 0; n != vlen; ++n) {
+    input0[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+    input1[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+  }
+  printf("32f_interleave_32fc_aligned\n");
+
+  clock_t began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_interleave_32fc_aligned16_manual(output_generic, input0, input1, vlen, "generic");
+  }
+  printf("generic_time: %f\n", seconds_between(began, clock()));
+
+  began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_interleave_32fc_aligned16_manual(output_sse, input0, input1, vlen, "sse");
+  }
+  printf("sse_time: %f\n", seconds_between(began, clock()));
+
+  for(int n = 0; n != vlen; ++n) {
+    const std::complex<float>& expected = output_generic[n];
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(expected), std::real(output_sse[n]), fabs(std::real(expected))*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(expected), std::imag(output_sse[n]), fabs(std::imag(expected))*1e-4);
+  }
+}
+
+#else
+
+// Built when LV_HAVE_SSE is undefined: nothing to compare, just say so.
+void qa_32f_interleave_32fc_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif
diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.h b/volk/lib/qa_32f_interleave_32fc_aligned16.h
new file mode 100644
index 000000000..cba518d37
--- /dev/null
+++ b/volk/lib/qa_32f_interleave_32fc_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H
+#define INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_interleave_32fc_aligned16 QA test.
+// The suite macros below register t1 as the only test case.
+class qa_32f_interleave_32fc_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_interleave_32fc_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; this header only declares it.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_max_aligned16.cc b/volk/lib/qa_32f_max_aligned16.cc
new file mode 100644
index 000000000..3ef375176
--- /dev/null
+++ b/volk/lib/qa_32f_max_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_max_aligned16.h>
+#include <volk/volk_32f_max_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifdef LV_HAVE_SSE
+
+// Converts a pair of clock() samples into elapsed seconds.
+static double seconds_between(clock_t from, clock_t to) {
+  return (double)(to-from)/(double)CLOCKS_PER_SEC;
+}
+
+// Runs volk_32f_max_aligned16 through its "generic" and "sse" variants on
+// the same two random inputs, prints the clock() time of each batch of ITERS
+// calls, and requires the two output vectors to be exactly equal.
+void qa_32f_max_aligned16::t1() {
+
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  // rand() is centred on RAND_MAX/2 and divided by RAND_MAX/2.
+  const float half_range = static_cast<float>((RAND_MAX/2));
+  for(int n = 0; n != vlen; ++n) {
+    input0[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+    input1[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+  }
+  printf("32f_max_aligned\n");
+
+  clock_t began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_max_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  printf("generic_time: %f\n", seconds_between(began, clock()));
+
+  began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_max_aligned16_manual(output01, input0, input1, vlen, "sse");
+  }
+  printf("sse_time: %f\n", seconds_between(began, clock()));
+
+  for(int n = 0; n != vlen; ++n) {
+    CPPUNIT_ASSERT_EQUAL(output0[n], output01[n]);
+  }
+}
+
+#else
+
+// Built when LV_HAVE_SSE is undefined: nothing to compare, just say so.
+void qa_32f_max_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif
diff --git a/volk/lib/qa_32f_max_aligned16.h b/volk/lib/qa_32f_max_aligned16.h
new file mode 100644
index 000000000..d535479f4
--- /dev/null
+++ b/volk/lib/qa_32f_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_MAX_ALIGNED16_H
+#define INCLUDED_QA_32F_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_max_aligned16 QA test.
+// The suite macros below register t1 as the only test case.
+class qa_32f_max_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_max_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; this header only declares it.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_min_aligned16.cc b/volk/lib/qa_32f_min_aligned16.cc
new file mode 100644
index 000000000..617e18b24
--- /dev/null
+++ b/volk/lib/qa_32f_min_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_min_aligned16.h>
+#include <volk/volk_32f_min_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifdef LV_HAVE_SSE
+
+// Converts a pair of clock() samples into elapsed seconds.
+static double seconds_between(clock_t from, clock_t to) {
+  return (double)(to-from)/(double)CLOCKS_PER_SEC;
+}
+
+// Runs volk_32f_min_aligned16 through its "generic" and "sse" variants on
+// the same two random inputs, prints the clock() time of each batch of ITERS
+// calls, and requires the two output vectors to be exactly equal.
+void qa_32f_min_aligned16::t1() {
+
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  // rand() is centred on RAND_MAX/2 and divided by RAND_MAX/2.
+  const float half_range = static_cast<float>((RAND_MAX/2));
+  for(int n = 0; n != vlen; ++n) {
+    input0[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+    input1[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+  }
+  printf("32f_min_aligned\n");
+
+  clock_t began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_min_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  printf("generic_time: %f\n", seconds_between(began, clock()));
+
+  began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_min_aligned16_manual(output01, input0, input1, vlen, "sse");
+  }
+  printf("sse_time: %f\n", seconds_between(began, clock()));
+
+  for(int n = 0; n != vlen; ++n) {
+    CPPUNIT_ASSERT_EQUAL(output0[n], output01[n]);
+  }
+}
+
+#else
+
+// Built when LV_HAVE_SSE is undefined: nothing to compare, just say so.
+void qa_32f_min_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif
diff --git a/volk/lib/qa_32f_min_aligned16.h b/volk/lib/qa_32f_min_aligned16.h
new file mode 100644
index 000000000..90961ac92
--- /dev/null
+++ b/volk/lib/qa_32f_min_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_MIN_ALIGNED16_H
+#define INCLUDED_QA_32F_MIN_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_min_aligned16 QA test.
+// The suite macros below register t1 as the only test case.
+class qa_32f_min_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_min_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; this header only declares it.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_MIN_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc
new file mode 100644
index 000000000..c77fe97da
--- /dev/null
+++ b/volk/lib/qa_32f_multiply_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_multiply_aligned16.h>
+#include <volk/volk_32f_multiply_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifdef LV_HAVE_SSE
+
+// Converts a pair of clock() samples into elapsed seconds.
+static double seconds_between(clock_t from, clock_t to) {
+  return (double)(to-from)/(double)CLOCKS_PER_SEC;
+}
+
+// Runs volk_32f_multiply_aligned16 through its "generic" and "sse" variants
+// on the same two random inputs, prints the clock() time of each batch of
+// ITERS calls, and requires the two output vectors to be exactly equal.
+void qa_32f_multiply_aligned16::t1() {
+
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  // rand() is centred on RAND_MAX/2 and divided by RAND_MAX/2.
+  const float half_range = static_cast<float>((RAND_MAX/2));
+  for(int n = 0; n != vlen; ++n) {
+    input0[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+    input1[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+  }
+  printf("32f_multiply_aligned\n");
+
+  clock_t began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_multiply_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  printf("generic_time: %f\n", seconds_between(began, clock()));
+
+  began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_multiply_aligned16_manual(output01, input0, input1, vlen, "sse");
+  }
+  printf("sse_time: %f\n", seconds_between(began, clock()));
+
+  for(int n = 0; n != vlen; ++n) {
+    CPPUNIT_ASSERT_EQUAL(output0[n], output01[n]);
+  }
+}
+
+#else
+
+// Built when LV_HAVE_SSE is undefined: nothing to compare, just say so.
+void qa_32f_multiply_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif
diff --git a/volk/lib/qa_32f_multiply_aligned16.h b/volk/lib/qa_32f_multiply_aligned16.h
new file mode 100644
index 000000000..7032a2ad4
--- /dev/null
+++ b/volk/lib/qa_32f_multiply_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H
+#define INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_multiply_aligned16 QA test.
+// The suite macros below register t1 as the only test case.
+class qa_32f_multiply_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_multiply_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; this header only declares it.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_normalize_aligned16.cc b/volk/lib/qa_32f_normalize_aligned16.cc
new file mode 100644
index 000000000..2954fc3ae
--- /dev/null
+++ b/volk/lib/qa_32f_normalize_aligned16.cc
@@ -0,0 +1,65 @@
+#include <volk/volk.h>
+#include <qa_32f_normalize_aligned16.h>
+#include <volk/volk_32f_normalize_aligned16.h>
+#include <cstdlib>
+#include <cstring>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+// Built when LV_HAVE_SSE is undefined: nothing to compare, just say so.
+void qa_32f_normalize_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+// Applies volk_32f_normalize_aligned16 (scalar 1.15) in place ITERS times to
+// two identical copies of a random vector, one through the "generic" variant
+// and one through "sse", prints the clock() time of each batch, and requires
+// the two buffers to agree within a relative 1e-4.
+void qa_32f_normalize_aligned16::t1() {
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 320001;
+  const int ITERS = 100;
+
+  // Both pointers start as NULL so the failure path can free() them
+  // unconditionally: on error posix_memalign leaves the pointer unmodified
+  // or sets it to null.
+  float* output0 = NULL;
+  float* output01 = NULL;
+  int ret = posix_memalign((void**)&output0, 16, vlen*sizeof(float));
+  ret |= posix_memalign((void**)&output01, 16, vlen*sizeof(float));
+  if(ret != 0) {
+    free(output0);
+    free(output01);
+    CPPUNIT_FAIL("posix_memalign failed");
+  }
+
+  for(int i = 0; i < vlen; ++i) {
+    output0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  // The kernel works in place, so the second variant needs its own copy.
+  memcpy(output01, output0, vlen*sizeof(float));
+  printf("32f_normalize_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_normalize_aligned16_manual(output0, 1.15, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_normalize_aligned16_manual(output01, 1.15, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4);
+  }
+
+  free(output0);
+  free(output01);
+}
+
+#endif
diff --git a/volk/lib/qa_32f_normalize_aligned16.h b/volk/lib/qa_32f_normalize_aligned16.h
new file mode 100644
index 000000000..7c421eb82
--- /dev/null
+++ b/volk/lib/qa_32f_normalize_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H
+#define INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_normalize_aligned16 QA test.
+// The suite macros below register t1 as the only test case.
+class qa_32f_normalize_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_normalize_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; this header only declares it.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_power_aligned16.cc b/volk/lib/qa_32f_power_aligned16.cc
new file mode 100644
index 000000000..1b331daeb
--- /dev/null
+++ b/volk/lib/qa_32f_power_aligned16.cc
@@ -0,0 +1,95 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_power_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+
+//test for sse
+// Maps one rand() draw onto the closed interval [-1, 1].
+static float uniform() {
+  return static_cast<float>(2.0 * ((float) rand() / RAND_MAX - 0.5));
+}
+
+// Fills buf[0..n) in ascending order with uniform() samples.
+static void
+random_floats (float *buf, unsigned n)
+{
+  unsigned filled = 0;
+  while (filled < n) {
+    buf[filled] = uniform ();
+    ++filled;
+  }
+}
+
+#ifdef LV_HAVE_SSE
+// Runs volk_32f_power_aligned16 (power = 3) on one random vector through the
+// "generic" variant, the "sse" variant and the runtime-dispatched entry
+// point, prints the clock() time of each batch of ITERS calls, and requires
+// the latter two outputs to match the generic one within a relative
+// ERR_DELTA.
+void qa_32f_power_aligned16::t1() {
+
+  volk_runtime_init();
+
+  const int vlen = 2046;
+  const int ITERS = 10000;
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  int i;
+
+  // All pointers start as NULL so the failure path can free() them
+  // unconditionally: on error posix_memalign leaves the pointer unmodified
+  // or sets it to null.
+  float* input = NULL;
+  float* result_generic = NULL;
+  float* result_sse = NULL;
+  float* result_sse4_1 = NULL;
+
+  int ret = posix_memalign((void**)&input, 16, vlen * sizeof(float));
+  ret |= posix_memalign((void**)&result_generic, 16, vlen * sizeof(float));
+  ret |= posix_memalign((void**)&result_sse, 16, vlen * sizeof(float));
+  ret |= posix_memalign((void**)&result_sse4_1, 16, vlen * sizeof(float));
+  if(ret != 0) {
+    free(input);
+    free(result_generic);
+    free(result_sse);
+    free(result_sse4_1);
+    CPPUNIT_FAIL("posix_memalign failed");
+  }
+
+  random_floats(input, vlen);
+
+  const float power = 3;
+
+  printf("32f_power_aligned16\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_power_aligned16_manual(result_generic, input, power, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_power_aligned16_manual(result_sse, input, power, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  // Runtime-dispatched implementation, reported under the sse4.1 label.
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_32f_power_aligned16(result_sse4_1, input, power, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.1_time: %f\n", total);
+
+  for(i = 0; i < vlen; i++){
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(result_generic[i], result_sse[i], fabs(result_generic[i])* ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(result_generic[i], result_sse4_1[i], fabs(result_generic[i])* ERR_DELTA);
+  }
+
+  free(input);
+  free(result_generic);
+  free(result_sse);
+  free(result_sse4_1); // previously leaked
+
+}
+#else
+// Built when LV_HAVE_SSE is undefined: nothing to compare, just say so.
+void qa_32f_power_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif /* LV_HAVE_SSE */
+
diff --git a/volk/lib/qa_32f_power_aligned16.h b/volk/lib/qa_32f_power_aligned16.h
new file mode 100644
index 000000000..d45df4e56
--- /dev/null
+++ b/volk/lib/qa_32f_power_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_POWER_ALIGNED16_H
+#define INCLUDED_QA_32F_POWER_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_power_aligned16 QA test.
+// The suite macros below register t1 as the only test case.
+class qa_32f_power_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_power_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; this header only declares it.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_POWER_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_sqrt_aligned16.cc b/volk/lib/qa_32f_sqrt_aligned16.cc
new file mode 100644
index 000000000..a3e6abc18
--- /dev/null
+++ b/volk/lib/qa_32f_sqrt_aligned16.cc
@@ -0,0 +1,59 @@
+#include <volk/volk.h>
+#include <qa_32f_sqrt_aligned16.h>
+#include <volk/volk_32f_sqrt_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifdef LV_HAVE_SSE
+
+// Converts a pair of clock() samples into elapsed seconds.
+static double seconds_between(clock_t from, clock_t to) {
+  return (double)(to-from)/(double)CLOCKS_PER_SEC;
+}
+
+// Runs volk_32f_sqrt_aligned16 through its "generic" and "sse" variants on
+// the same non-negative random input, prints the clock() time of each batch
+// of ITERS calls, and requires both outputs to agree within a relative 1e-4.
+void qa_32f_sqrt_aligned16::t1() {
+
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  // Inputs stay non-negative: negative values would only produce NaN.
+  const float full_range = static_cast<float>(RAND_MAX);
+  for(int n = 0; n != vlen; ++n) {
+    input0[n] = ((float) (rand()) / full_range);
+  }
+  printf("32f_sqrt_aligned\n");
+
+  clock_t began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "generic");
+  }
+  printf("generic_time: %f\n", seconds_between(began, clock()));
+
+  began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_sqrt_aligned16_manual(output01, input0, vlen, "sse");
+  }
+  printf("sse_time: %f\n", seconds_between(began, clock()));
+
+  for(int n = 0; n != vlen; ++n) {
+    const double tolerance = fabs(output0[n])*1e-4;
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[n], output01[n], tolerance);
+  }
+}
+
+#else
+
+// Built when LV_HAVE_SSE is undefined: nothing to compare, just say so.
+void qa_32f_sqrt_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif
diff --git a/volk/lib/qa_32f_sqrt_aligned16.h b/volk/lib/qa_32f_sqrt_aligned16.h
new file mode 100644
index 000000000..e4b99d981
--- /dev/null
+++ b/volk/lib/qa_32f_sqrt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_SQRT_ALIGNED16_H
+#define INCLUDED_QA_32F_SQRT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_sqrt_aligned16 QA test.
+// The suite macros below register t1 as the only test case.
+class qa_32f_sqrt_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_sqrt_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; this header only declares it.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_SQRT_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_stddev_aligned16.cc b/volk/lib/qa_32f_stddev_aligned16.cc
new file mode 100644
index 000000000..c0f22cdea
--- /dev/null
+++ b/volk/lib/qa_32f_stddev_aligned16.cc
@@ -0,0 +1,74 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_stddev_aligned16.h>
+#include <volk/volk_32f_stddev_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifdef LV_HAVE_SSE
+
+// Converts a pair of clock() samples into elapsed seconds.
+static double seconds_between(clock_t from, clock_t to) {
+  return (double)(to-from)/(double)CLOCKS_PER_SEC;
+}
+
+// Runs volk_32f_stddev_aligned16 on one random vector (with its precomputed
+// mean) through the "generic" variant, the "sse" variant and the
+// runtime-dispatched entry point, prints the clock() time of each batch of
+// ITERS calls, and requires the latter two results to match the generic one
+// within a relative 1e-4.
+void qa_32f_stddev_aligned16::t1() {
+  volk_runtime_init();
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+
+  float stddev_generic;
+  float stddev_sse;
+  float stddev_sse4_1;
+
+  // Fill the input and accumulate its sum in the same pass, then average.
+  float mean = 0;
+  const float half_range = static_cast<float>((RAND_MAX/2));
+  for(int n = 0; n != vlen; ++n) {
+    input0[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+    mean += input0[n];
+  }
+  mean /= static_cast<float>(vlen);
+
+  printf("32f_stddev_aligned\n");
+
+  clock_t began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_stddev_aligned16_manual(&stddev_generic, input0, mean, vlen, "generic");
+  }
+  printf("generic_time: %f\n", seconds_between(began, clock()));
+
+  began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    volk_32f_stddev_aligned16_manual(&stddev_sse, input0, mean, vlen, "sse");
+  }
+  printf("sse_time: %f\n", seconds_between(began, clock()));
+
+  // Runtime-dispatched implementation, reported under the sse4_1 label.
+  began = clock();
+  for(int pass = 0; pass != ITERS; ++pass) {
+    get_volk_runtime()->volk_32f_stddev_aligned16(&stddev_sse4_1, input0, mean, vlen);
+  }
+  printf("sse4_1_time: %f\n", seconds_between(began, clock()));
+
+  const double tolerance = fabs(stddev_generic)*1e-4;
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse, tolerance);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse4_1, tolerance);
+
+}
+
+#else
+
+// Built when LV_HAVE_SSE is undefined: nothing to compare, just say so.
+void qa_32f_stddev_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif
diff --git a/volk/lib/qa_32f_stddev_aligned16.h b/volk/lib/qa_32f_stddev_aligned16.h
new file mode 100644
index 000000000..7f8d7a5fc
--- /dev/null
+++ b/volk/lib/qa_32f_stddev_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_STDDEV_ALIGNED16_H
+#define INCLUDED_QA_32F_STDDEV_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_stddev_aligned16 QA test.
+// The suite macros below register t1 as the only test case.
+class qa_32f_stddev_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_stddev_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; this header only declares it.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_STDDEV_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.cc b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc
new file mode 100644
index 000000000..dcad8bcf3
--- /dev/null
+++ b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc
@@ -0,0 +1,75 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_stddev_and_mean_aligned16.h>
+#include <volk/volk_32f_stddev_and_mean_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+ // Built when LV_HAVE_SSE is not defined: prints a notice and performs no checks.
+ void qa_32f_stddev_and_mean_aligned16::t1() {
+ printf("sse not available... no test performed\n");
+ }
+
+#else
+
+ // Built when LV_HAVE_SSE is defined. Runs volk_32f_stddev_and_mean_aligned16
+ // three ways ("generic", "sse", and through the runtime dispatch table), prints
+ // the CPU time spent on ITERS calls of each, and asserts that the "sse" and
+ // runtime results agree with the "generic" results within a relative 1e-4.
+ void qa_32f_stddev_and_mean_aligned16::t1() {
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ float input0[vlen] __attribute__ ((aligned (16)));
+
+ float stddev_generic;
+ float stddev_sse;
+ float stddev_sse4_1;
+ float mean_generic;
+ float mean_sse;
+ float mean_sse4_1;
+
+ // Fill the input with pseudo-random values centered on zero, roughly in [-1, 1].
+ // rand() is never seeded here.
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+ }
+ printf("32f_stddev_and_mean_aligned\n");
+
+ // Time the explicitly selected "generic" implementation.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_stddev_and_mean_aligned16_manual(&stddev_generic, &mean_generic, input0,vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ // Time the explicitly selected "sse" implementation.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_stddev_and_mean_aligned16_manual(&stddev_sse, &mean_sse, input0,vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ // Time whatever implementation the runtime table provides. The output is
+ // labelled "sse4_1", but nothing here forces that variant to be selected.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_32f_stddev_and_mean_aligned16(&stddev_sse4_1, &mean_sse4_1, input0, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4_1_time: %f\n", total);
+
+ // Leftover debugging scaffold: the loop body contains only commented-out prints.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ // The allowed delta is 1e-4 times the magnitude of the generic result.
+ // NOTE(review): the inputs are centered on zero, so mean_generic is presumably
+ // close to zero and the mean tolerance becomes very tight — confirm this is intended.
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse, fabs(stddev_generic)*1e-4);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(mean_generic, mean_sse, fabs(mean_generic)*1e-4);
+
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse4_1, fabs(stddev_generic)*1e-4);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(mean_generic, mean_sse4_1, fabs(mean_generic)*1e-4);
+
+ }
+
+#endif
diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.h b/volk/lib/qa_32f_stddev_and_mean_aligned16.h
new file mode 100644
index 000000000..e08bd249a
--- /dev/null
+++ b/volk/lib/qa_32f_stddev_and_mean_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H
+#define INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+ // CppUnit test fixture. The suite macros below register t1 as the only test
+ // case; t1 is declared here and defined outside this header.
+ class qa_32f_stddev_and_mean_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_stddev_and_mean_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+ };
+
+
+#endif /* INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_subtract_aligned16.cc b/volk/lib/qa_32f_subtract_aligned16.cc
new file mode 100644
index 000000000..a7e1b5ae3
--- /dev/null
+++ b/volk/lib/qa_32f_subtract_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_subtract_aligned16.h>
+#include <volk/volk_32f_subtract_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+ // Built when LV_HAVE_SSE is not defined: prints a notice and performs no checks.
+ void qa_32f_subtract_aligned16::t1() {
+ printf("sse not available... no test performed\n");
+ }
+
+#else
+
+ // Built when LV_HAVE_SSE is defined. Runs the "generic" and "sse" variants of
+ // volk_32f_subtract_aligned16 on the same two random inputs, prints the CPU time
+ // spent on ITERS calls of each, and requires every output element to be equal.
+ void qa_32f_subtract_aligned16::t1() {
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ float input0[vlen] __attribute__ ((aligned (16)));
+ float input1[vlen] __attribute__ ((aligned (16)));
+
+ float output0[vlen] __attribute__ ((aligned (16)));
+ float output01[vlen] __attribute__ ((aligned (16)));
+
+ // Fill both inputs with pseudo-random values centered on zero, roughly in [-1, 1].
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+ input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+ }
+ printf("32f_subtract_aligned\n");
+
+ // Time the "generic" implementation; its result lands in output0.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_subtract_aligned16_manual(output0, input0, input1, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ // Time the "sse" implementation; its result lands in output01.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_subtract_aligned16_manual(output01, input0, input1, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+ // Leftover debugging scaffold: the loop body contains only commented-out prints.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ // Exact float equality is required here, not a tolerance-based comparison.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+ }
+ }
+
+#endif
diff --git a/volk/lib/qa_32f_subtract_aligned16.h b/volk/lib/qa_32f_subtract_aligned16.h
new file mode 100644
index 000000000..97c14f129
--- /dev/null
+++ b/volk/lib/qa_32f_subtract_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H
+#define INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+ // CppUnit test fixture. The suite macros below register t1 as the only test
+ // case; t1 is declared here and defined outside this header.
+ class qa_32f_subtract_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_subtract_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+ };
+
+
+#endif /* INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_sum_of_poly_aligned16.cc b/volk/lib/qa_32f_sum_of_poly_aligned16.cc
new file mode 100644
index 000000000..494776357
--- /dev/null
+++ b/volk/lib/qa_32f_sum_of_poly_aligned16.cc
@@ -0,0 +1,142 @@
+#include <volk/volk.h>
+#include <qa_32f_sum_of_poly_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+
+#define SNR 30.0
+#define CENTER -4.0
+#define CUTOFF -5.595
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 100000
+#define VEC_LEN 64
+ // Returns rand() scaled by RAND_MAX as a float.
+ static float uniform() {
+ return ((float) rand() / RAND_MAX); // in [0, 1]: both end points are reachable
+ }
+
+ // Fills buf[0..n-1] with uniform() * -SNR/2.0. With SNR defined as 30.0 above,
+ // every value lies in [-15, 0].
+ static void
+ random_floats (float *buf, unsigned n)
+ {
+ unsigned int i = 0;
+ for (; i < n; i++) {
+
+ buf[i] = uniform () * -SNR/2.0;
+
+ }
+ }
+
+
+#ifndef LV_HAVE_SSE3
+
+ // Built when LV_HAVE_SSE3 is not defined: prints a notice and performs no checks.
+ void qa_32f_sum_of_poly_aligned16::t1(){
+ printf("sse3 not available... no test performed\n");
+ }
+
+#else
+
+
+ // Built when LV_HAVE_SSE3 is defined. Checks that the "sse3" variant of
+ // volk_32f_sum_of_poly_aligned16 matches the "generic" variant within a
+ // relative tolerance of ERR_DELTA. Also prints the CPU time spent on NUM_ITERS
+ // calls of each variant and on a plain expf() summation of the same input.
+ void qa_32f_sum_of_poly_aligned16::t1(){
+   volk_environment_init();
+   int ret;
+
+   const int vlen = VEC_LEN;
+   float cutoff = CUTOFF;
+
+   float* center_point_array;
+   float* target;
+   float* target_generic;
+   float* src0 ;
+
+   // posix_memalign reports failure only through its return value, so each
+   // allocation is checked before the pointer is used.
+   ret = posix_memalign((void**)&center_point_array, 16, 24);
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&target, 16, 4);
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&target_generic, 16, 4);
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&src0, 16, (vlen << 2));
+   CPPUNIT_ASSERT(ret == 0);
+
+   random_floats((float*)src0, vlen);
+
+   // Five coefficients derived from CENTER; the commented-out leading terms
+   // are kept from the original for reference.
+   float a = (float)CENTER;
+   float etoa = expf(a);
+   center_point_array[0] = (//(5.0 * a * a * a * a)/120.0 +
+                            (-4.0 * a * a * a)/24.0 +
+                            (3.0 * a * a)/6.0 +
+                            (-2.0 * a)/2.0 +
+                            (1.0)) * etoa;
+   center_point_array[1] = (//(-10.0 * a * a * a)/120.0 +
+                            (6.0 * a * a)/24.0 +
+                            (-3.0 * a)/6.0 +
+                            (1.0/2.0)) * etoa;
+   center_point_array[2] = (//(10.0 * a * a)/120.0 +
+                            (-4.0 * a)/24.0 +
+                            (1.0/6.0)) * etoa;
+   center_point_array[3] = (//(-5.0 * a)/120.0 +
+                            (1.0/24.0)) * etoa;
+   //center_point_array[4] = ((1.0)/120.0) * etoa;
+   center_point_array[4] = (//(a * a * a * a * a)/120.0 +
+                            (a * a * a * a)/24.0 +
+                            (a * a * a)/-6.0 +
+                            (a * a)/2.0 +
+                            -a + 1.0) * etoa;
+
+   printf("32f_sum_of_poly_aligned16\n");
+
+   clock_t start, end;
+   double total;
+
+   // Reference timing: sum expf() over the input directly. The sum is only
+   // printed below; it is not part of the assertion.
+   float my_sum = 0.0;
+   start = clock();
+   for(int k = 0; k < NUM_ITERS; ++k) {
+     float sum = 0.0;
+     for(int l = 0; l < vlen; ++l) {
+       sum += expf(src0[l]);
+     }
+     my_sum = sum;
+   }
+   end = clock();
+   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+   printf("exp time: %f\n", total);
+
+   // The length argument is vlen << 2, the same value used as the allocation
+   // size of src0 in bytes.
+   start = clock();
+   for(int k = 0; k < NUM_ITERS; ++k) {
+     volk_32f_sum_of_poly_aligned16_manual(target_generic, src0, center_point_array, &cutoff, vlen << 2, "generic");
+   }
+   end = clock();
+   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+   printf("generic time: %f\n", total);
+
+   start = clock();
+   for(int k = 0; k < NUM_ITERS; ++k) {
+     volk_32f_sum_of_poly_aligned16_manual(target, src0, center_point_array, &cutoff, vlen << 2, "sse3");
+   }
+   end = clock();
+   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+   printf("sse3 approx time: %f\n", total);
+
+   // Copy the two results out and release the buffers before asserting, so
+   // that nothing leaks when the comparison fails.
+   const float generic_result = target_generic[0];
+   const float sse3_result = target[0];
+
+   printf("exp: %f, sse3: %f\n", my_sum, sse3_result);
+
+   free(center_point_array);
+   free(target);
+   free(target_generic);
+   free(src0);
+
+   CPPUNIT_ASSERT_DOUBLES_EQUAL(generic_result, sse3_result, fabs(generic_result) * ERR_DELTA);
+ }
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32f_sum_of_poly_aligned16.h b/volk/lib/qa_32f_sum_of_poly_aligned16.h
new file mode 100644
index 000000000..67a347f9a
--- /dev/null
+++ b/volk/lib/qa_32f_sum_of_poly_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H
+#define INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+ // CppUnit test fixture. The suite macros below register t1 as the only test
+ // case; t1 is declared here and defined outside this header.
+ class qa_32f_sum_of_poly_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_sum_of_poly_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+ };
+
+
+#endif /* INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.cc b/volk/lib/qa_32fc_32f_multiply_aligned16.cc
new file mode 100644
index 000000000..4eba0a3cd
--- /dev/null
+++ b/volk/lib/qa_32fc_32f_multiply_aligned16.cc
@@ -0,0 +1,85 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32fc_32f_multiply_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define assertcomplexEqual(expected, actual, delta) \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);
+
+#define ERR_DELTA (1e-4)
+
+//test for sse
+ // Maps rand() onto a float centered on zero.
+ static float uniform() {
+ return 2.0 * ((float) rand() / RAND_MAX - 0.5); // in [-1, 1]: both end points are reachable
+ }
+
+ // Fills buf[0..n-1] with successive uniform() values.
+ static void
+ random_floats (float *buf, unsigned n)
+ {
+ for (unsigned i = 0; i < n; i++)
+ buf[i] = uniform ();
+ }
+
+#ifdef LV_HAVE_SSE3
+ // Built when LV_HAVE_SSE3 is defined. Runs the "generic" and "sse3" variants of
+ // volk_32fc_32f_multiply_aligned16 on the same random complex input and float
+ // taps, prints the CPU time spent on ITERS calls of each, and asserts that
+ // every output element agrees component-wise within a relative ERR_DELTA.
+ void qa_32fc_32f_multiply_aligned16::t1() {
+
+   const int vlen = 2046;
+   const int ITERS = 100000;
+
+   volk_environment_init();
+   int ret;
+   clock_t start, end;
+   double total;
+   std::complex<float>* input;
+   float * taps;
+
+   std::complex<float>* result_generic;
+   std::complex<float>* result_sse3;
+
+   // posix_memalign reports failure only through its return value, so each
+   // allocation is checked before the pointer is used.
+   ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float));
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float));
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float));
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&result_sse3, 16, vlen * 2 * sizeof(float));
+   CPPUNIT_ASSERT(ret == 0);
+
+   // The complex input is filled as 2 * vlen consecutive floats.
+   random_floats((float*)input, vlen * 2);
+   random_floats(taps, vlen);
+
+   printf("32fc_32f_multiply_aligned16\n");
+
+   start = clock();
+   for(int count = 0; count < ITERS; ++count) {
+     volk_32fc_32f_multiply_aligned16_manual(result_generic, input, taps, vlen, "generic");
+   }
+   end = clock();
+   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+   printf("generic_time: %f\n", total);
+
+   start = clock();
+   for(int count = 0; count < ITERS; ++count) {
+     volk_32fc_32f_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3");
+   }
+   end = clock();
+   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+   printf("sse3_time: %f\n", total);
+
+   for(int i = 0; i < vlen; i++){
+     assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA);
+   }
+
+   free(input);
+   free(taps);
+   free(result_generic);
+   free(result_sse3);
+
+ }
+#else
+ // Built when LV_HAVE_SSE3 is not defined: prints a notice and performs no checks.
+ void qa_32fc_32f_multiply_aligned16::t1() {
+ printf("sse3 not available... no test performed\n");
+ }
+
+#endif /* LV_HAVE_SSE3 */
+
diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.h b/volk/lib/qa_32fc_32f_multiply_aligned16.h
new file mode 100644
index 000000000..fc3b3eeb2
--- /dev/null
+++ b/volk/lib/qa_32fc_32f_multiply_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H
+#define INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+ // CppUnit test fixture. The suite macros below register t1 as the only test
+ // case; t1 is declared here and defined outside this header.
+ class qa_32fc_32f_multiply_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_32f_multiply_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+ };
+
+
+#endif /* INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc b/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc
new file mode 100644
index 000000000..64ea65da9
--- /dev/null
+++ b/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc
@@ -0,0 +1,83 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32fc_32f_power_32fc_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define assertcomplexEqual(expected, actual, delta) \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);
+
+#define ERR_DELTA (1.5e-3)
+
+//test for sse
+ // Maps rand() onto a float centered on zero.
+ static float uniform() {
+ return 2.0 * ((float) rand() / RAND_MAX - 0.5); // in [-1, 1]: both end points are reachable
+ }
+
+ // Fills buf[0..n-1] with successive uniform() values.
+ static void
+ random_floats (float *buf, unsigned n)
+ {
+ for (unsigned i = 0; i < n; i++)
+ buf[i] = uniform ();
+ }
+
+#ifdef LV_HAVE_SSE
+ // Built when LV_HAVE_SSE is defined. Runs the "generic" and "sse" variants of
+ // volk_32fc_32f_power_32fc_aligned16 on the same random complex input with
+ // power = 3.2, prints the CPU time spent on ITERS calls of each, and asserts
+ // that every output element agrees component-wise within a relative ERR_DELTA.
+ void qa_32fc_32f_power_32fc_aligned16::t1() {
+
+   const int vlen = 2046;
+   const int ITERS = 10000;
+
+   volk_environment_init();
+   int ret;
+   clock_t start, end;
+   double total;
+   std::complex<float>* input;
+
+   std::complex<float>* result_generic;
+   std::complex<float>* result_sse;
+
+   // posix_memalign reports failure only through its return value, so each
+   // allocation is checked before the pointer is used.
+   ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float));
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float));
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&result_sse, 16, vlen * 2 * sizeof(float));
+   CPPUNIT_ASSERT(ret == 0);
+
+   // The complex input is filled as 2 * vlen consecutive floats.
+   random_floats((float*)input, vlen * 2);
+
+   const float power = 3.2;
+
+   printf("32fc_32f_power_32fc_aligned16\n");
+
+   start = clock();
+   for(int count = 0; count < ITERS; ++count) {
+     volk_32fc_32f_power_32fc_aligned16_manual(result_generic, input, power, vlen, "generic");
+   }
+   end = clock();
+   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+   printf("generic_time: %f\n", total);
+
+   start = clock();
+   for(int count = 0; count < ITERS; ++count) {
+     volk_32fc_32f_power_32fc_aligned16_manual(result_sse, input, power, vlen, "sse");
+   }
+   end = clock();
+   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+   printf("sse_time: %f\n", total);
+
+   for(int i = 0; i < vlen; i++){
+     assertcomplexEqual(result_generic[i], result_sse[i], ERR_DELTA);
+   }
+
+   free(input);
+   free(result_generic);
+   free(result_sse);
+
+ }
+#else
+ // Built when LV_HAVE_SSE is not defined: prints a notice and performs no checks.
+ void qa_32fc_32f_power_32fc_aligned16::t1() {
+ printf("sse not available... no test performed\n");
+ }
+
+#endif /* LV_HAVE_SSE */
+
diff --git a/volk/lib/qa_32fc_32f_power_32fc_aligned16.h b/volk/lib/qa_32fc_32f_power_32fc_aligned16.h
new file mode 100644
index 000000000..464b7b7cc
--- /dev/null
+++ b/volk/lib/qa_32fc_32f_power_32fc_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H
+#define INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+ // CppUnit test fixture. The suite macros below register t1 as the only test
+ // case; t1 is declared here and defined outside this header.
+ class qa_32fc_32f_power_32fc_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_32f_power_32fc_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+ };
+
+
+#endif /* INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.cc b/volk/lib/qa_32fc_atan2_32f_aligned16.cc
new file mode 100644
index 000000000..a24382d71
--- /dev/null
+++ b/volk/lib/qa_32fc_atan2_32f_aligned16.cc
@@ -0,0 +1,75 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32fc_atan2_32f_aligned16.h>
+#include <volk/volk_32fc_atan2_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+ // Built when LV_HAVE_SSE is not defined: prints a notice and performs no checks.
+ void qa_32fc_atan2_32f_aligned16::t1() {
+ printf("sse not available... no test performed\n");
+ }
+
+#else
+
+ // Built when LV_HAVE_SSE is defined. Runs volk_32fc_atan2_32f_aligned16 three
+ // ways ("generic", "sse", and through the runtime dispatch table), prints the
+ // CPU time spent on ITERS calls of each, and asserts that the "sse" and runtime
+ // outputs match the "generic" output element by element within a relative 1e-4.
+ void qa_32fc_atan2_32f_aligned16::t1() {
+
+
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 10000;
+ std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_sse[vlen] __attribute__ ((aligned (16)));
+ float output_sse4_1[vlen] __attribute__ ((aligned (16)));
+
+ // Fill the complex input as 2 * vlen consecutive floats, with pseudo-random
+ // values centered on zero, roughly in [-1, 1].
+ float* inputLoad = (float*)input0;
+ for(int i = 0; i < 2*vlen; ++i) {
+ inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+ }
+ printf("32fc_atan2_32f_aligned\n");
+
+ // The literal 32768.0 is handed to every variant unchanged.
+ // NOTE(review): presumably an output scale factor — confirm against the kernel header.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32fc_atan2_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32fc_atan2_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ // Time whatever implementation the runtime table provides. The output is
+ // labelled "sse4_1", but nothing here forces that variant to be selected.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_32fc_atan2_32f_aligned16(output_sse4_1, input0, 32768.0, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4_1_time: %f\n", total);
+
+ // Leftover debugging scaffold: the loop body contains only commented-out prints.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ // The allowed delta per element is 1e-4 times the magnitude of the generic value.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4);
+ }
+ }
+
+#endif
diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.h b/volk/lib/qa_32fc_atan2_32f_aligned16.h
new file mode 100644
index 000000000..9c4dc209a
--- /dev/null
+++ b/volk/lib/qa_32fc_atan2_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+ // CppUnit test fixture. The suite macros below register t1 as the only test
+ // case; t1 is declared here and defined outside this header.
+ class qa_32fc_atan2_32f_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_atan2_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+ };
+
+
+#endif /* INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc
new file mode 100644
index 000000000..497914e0a
--- /dev/null
+++ b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc
@@ -0,0 +1,137 @@
+#include <volk/volk.h>
+#include <qa_32fc_conjugate_dot_prod_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+
+
+#define assertcomplexEqual(expected, actual, delta) \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);
+
+#define ERR_DELTA (1e-4)
+
+//test for sse
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+ // Maps rand() onto a float centered on zero.
+ static float uniform() {
+ return 2.0 * ((float) rand() / RAND_MAX - 0.5); // in [-1, 1]: both end points are reachable
+ }
+
+ // Fills buf[0..n-1] with uniform() * 32767, i.e. values in [-32767, 32767].
+ static void
+ random_floats (float *buf, unsigned n)
+ {
+ for (unsigned i = 0; i < n; i++)
+ buf[i] = uniform () * 32767;
+ }
+
+
+ // Built when LV_HAVE_SSE && LV_HAVE_64 holds. Runs the "generic" and "sse"
+ // variants of volk_32fc_conjugate_dot_prod_aligned16 once each on the same
+ // random input and taps, prints both results, and asserts that they agree
+ // component-wise within a relative ERR_DELTA.
+ void qa_32fc_conjugate_dot_prod_aligned16::t1() {
+   const int vlen = 789743;
+
+   volk_environment_init();
+   int ret;
+
+   std::complex<float>* input;
+   std::complex<float>* taps;
+
+   std::complex<float>* result_generic;
+   std::complex<float>* result;
+
+   // posix_memalign reports failure only through its return value, so each
+   // allocation is checked before the pointer is used.
+   ret = posix_memalign((void**)&input, 16, vlen << 3);
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&taps, 16, vlen << 3);
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&result_generic, 16, 8);
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&result, 16, 8);
+   CPPUNIT_ASSERT(ret == 0);
+
+   result_generic[0] = std::complex<float>(0,0);
+   result[0] = std::complex<float>(0,0);
+
+   // Each complex buffer is filled as 2 * vlen consecutive floats.
+   random_floats((float*)input, vlen * 2);
+   random_floats((float*)taps, vlen * 2);
+
+   // The length argument is vlen * 8, the same value as the allocation size
+   // of input and taps in bytes (vlen << 3).
+   volk_32fc_conjugate_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic");
+   volk_32fc_conjugate_dot_prod_aligned16_manual(result, input, taps, vlen * 8, "sse");
+
+   // Copy the two results out and release the buffers before asserting, so
+   // that nothing leaks when the comparison fails.
+   const std::complex<float> expected = result_generic[0];
+   const std::complex<float> actual = result[0];
+
+   printf("32fc_conjugate_dot_prod_aligned16\n");
+   printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(expected), std::imag(expected), std::real(actual), std::imag(actual));
+
+   free(input);
+   free(taps);
+   free(result_generic);
+   free(result);
+
+   assertcomplexEqual(expected, actual, ERR_DELTA);
+ }
+
+
+#elif LV_HAVE_SSE && LV_HAVE_32
+
+ // Maps rand() onto a float centered on zero.
+ static float uniform() {
+ return 2.0 * ((float) rand() / RAND_MAX - 0.5); // in [-1, 1]: both end points are reachable
+ }
+
+ // Fills buf[0..n-1] with uniform() * 32767, i.e. values in [-32767, 32767].
+ static void
+ random_floats (float *buf, unsigned n)
+ {
+ for (unsigned i = 0; i < n; i++)
+ buf[i] = uniform () * 32767;
+ }
+
+
+ // Built when LV_HAVE_SSE && LV_HAVE_32 holds (and the 64-bit branch does not).
+ // Runs the "generic" and "sse_32" variants of
+ // volk_32fc_conjugate_dot_prod_aligned16 once each on the same random input and
+ // taps, prints both results, and asserts that they agree component-wise within
+ // a relative ERR_DELTA.
+ void qa_32fc_conjugate_dot_prod_aligned16::t1() {
+   const int vlen = 789743;
+
+   volk_environment_init();
+   int ret;
+
+   std::complex<float>* input;
+   std::complex<float>* taps;
+
+   std::complex<float>* result_generic;
+   std::complex<float>* result;
+
+   // posix_memalign reports failure only through its return value, so each
+   // allocation is checked before the pointer is used.
+   ret = posix_memalign((void**)&input, 16, vlen << 3);
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&taps, 16, vlen << 3);
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&result_generic, 16, 8);
+   CPPUNIT_ASSERT(ret == 0);
+   ret = posix_memalign((void**)&result, 16, 8);
+   CPPUNIT_ASSERT(ret == 0);
+
+   result_generic[0] = std::complex<float>(0,0);
+   result[0] = std::complex<float>(0,0);
+
+   // Each complex buffer is filled as 2 * vlen consecutive floats.
+   random_floats((float*)input, vlen * 2);
+   random_floats((float*)taps, vlen * 2);
+
+   // The length argument is vlen * 8, the same value as the allocation size
+   // of input and taps in bytes (vlen << 3).
+   volk_32fc_conjugate_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic");
+   volk_32fc_conjugate_dot_prod_aligned16_manual(result, input, taps, vlen * 8, "sse_32");
+
+   // Copy the two results out and release the buffers before asserting, so
+   // that nothing leaks when the comparison fails.
+   const std::complex<float> expected = result_generic[0];
+   const std::complex<float> actual = result[0];
+
+   printf("32fc_conjugate_dot_prod_aligned16\n");
+   printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(expected), std::imag(expected), std::real(actual), std::imag(actual));
+
+   free(input);
+   free(taps);
+   free(result_generic);
+   free(result);
+
+   assertcomplexEqual(expected, actual, ERR_DELTA);
+ }
+
+
+#else
+
+ // Built when neither "LV_HAVE_SSE && LV_HAVE_64" nor "LV_HAVE_SSE && LV_HAVE_32"
+ // holds: prints a notice and performs no checks.
+ void qa_32fc_conjugate_dot_prod_aligned16::t1() {
+ printf("sse not available... no test performed\n");
+ }
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h
new file mode 100644
index 000000000..507b1769b
--- /dev/null
+++ b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H
+#define INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+ // CppUnit test fixture. The suite macros below register t1 as the only test
+ // case; t1 is declared here and defined outside this header.
+ class qa_32fc_conjugate_dot_prod_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_conjugate_dot_prod_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+ };
+
+
+#endif /* INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc
new file mode 100644
index 000000000..0f5a030f5
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk.h>
+#include <qa_32fc_deinterleave_32f_aligned16.h>
+#include <volk/volk_32fc_deinterleave_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+ // Built when LV_HAVE_SSE is not defined: prints a notice and performs no checks.
+ void qa_32fc_deinterleave_32f_aligned16::t1() {
+ printf("sse not available... no test performed\n");
+ }
+
+#else
+
+ // Built when LV_HAVE_SSE is defined. Runs the "generic" and "sse" variants of
+ // volk_32fc_deinterleave_32f_aligned16 on the same random complex input, prints
+ // the CPU time spent on ITERS calls of each, and asserts that both float output
+ // arrays of the two variants match element by element within a relative 1e-4.
+ void qa_32fc_deinterleave_32f_aligned16::t1() {
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+
+ // Each variant writes two output arrays of vlen floats.
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_generic1[vlen] __attribute__ ((aligned (16)));
+ float output_sse[vlen] __attribute__ ((aligned (16)));
+ float output_sse1[vlen] __attribute__ ((aligned (16)));
+
+ // Fill the complex input as 2 * vlen consecutive floats, with pseudo-random
+ // values centered on zero, roughly in [-1, 1].
+ float* inputLoad = (float*)input0;
+ for(int i = 0; i < 2*vlen; ++i) {
+ inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+ }
+ printf("32fc_deinterleave_32f_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32fc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32fc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ // Leftover debugging scaffold: the loop body contains only commented-out prints.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ // The allowed delta per element is 1e-4 times the magnitude of the generic value.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], fabs(output_generic1[i])*1e-4);
+ }
+ }
+
+#endif
diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.h b/volk/lib/qa_32fc_deinterleave_32f_aligned16.h
new file mode 100644
index 000000000..78660e6ad
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+ // CppUnit test fixture. The suite macros below register t1 as the only test
+ // case; t1 is declared here and defined outside this header.
+ class qa_32fc_deinterleave_32f_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+ };
+
+
+#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc
new file mode 100644
index 000000000..6e051afbc
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk.h>
+#include <qa_32fc_deinterleave_64f_aligned16.h>
+#include <volk/volk_32fc_deinterleave_64f_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+ // Built when LV_HAVE_SSE2 is not defined: prints a notice and performs no checks.
+ void qa_32fc_deinterleave_64f_aligned16::t1() {
+ printf("sse2 not available... no test performed\n");
+ }
+
+#else
+
+ // Built when LV_HAVE_SSE2 is defined. Runs the "generic" and "sse2" variants of
+ // volk_32fc_deinterleave_64f_aligned16 on the same random complex input, prints
+ // the CPU time spent on ITERS calls of each, and asserts that both double output
+ // arrays of the two variants match element by element within a relative 1e-4.
+ void qa_32fc_deinterleave_64f_aligned16::t1() {
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+
+ // Each variant writes two output arrays of vlen doubles.
+ double output_generic[vlen] __attribute__ ((aligned (16)));
+ double output_generic1[vlen] __attribute__ ((aligned (16)));
+ double output_sse2[vlen] __attribute__ ((aligned (16)));
+ double output_sse21[vlen] __attribute__ ((aligned (16)));
+
+ // Fill the complex input as 2 * vlen consecutive floats, with pseudo-random
+ // values centered on zero, roughly in [-1, 1].
+ float* inputLoad = (float*)input0;
+ for(int i = 0; i < 2*vlen; ++i) {
+ inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+ }
+ printf("32fc_deinterleave_64f_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32fc_deinterleave_64f_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32fc_deinterleave_64f_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+
+ // Leftover debugging scaffold: the loop body contains only commented-out prints.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ // The allowed delta per element is 1e-4 times the magnitude of the generic value.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse21[i], fabs(output_generic1[i])*1e-4);
+ }
+ }
+
+#endif
diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.h b/volk/lib/qa_32fc_deinterleave_64f_aligned16.h
new file mode 100644
index 000000000..f924b9752
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_64f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H
+#define INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+ // CppUnit test fixture. The suite macros below register t1 as the only test
+ // case; t1 is declared here and defined outside this header.
+ class qa_32fc_deinterleave_64f_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_64f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+ };
+
+
+#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc
new file mode 100644
index 000000000..850518524
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32fc_deinterleave_real_16s_aligned16.h>
+#include <volk/volk_32fc_deinterleave_real_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+ // Built when LV_HAVE_SSE is not defined: prints a notice and performs no checks.
+ void qa_32fc_deinterleave_real_16s_aligned16::t1() {
+ printf("sse not available... no test performed\n");
+ }
+
+#else
+
+ // Built when LV_HAVE_SSE is defined. Runs the "generic" and "sse" variants of
+ // volk_32fc_deinterleave_real_16s_aligned16 on the same random complex input,
+ // prints the CPU time spent on ITERS calls of each, and asserts that the int16_t
+ // outputs match element by element within a relative 1e-4.
+ void qa_32fc_deinterleave_real_16s_aligned16::t1() {
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+
+ int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+
+ // Fill the complex input as 2 * vlen consecutive floats, with pseudo-random
+ // values centered on zero, roughly in [-1, 1].
+ float* inputLoad = (float*)input0;
+ for(int i = 0; i < 2*vlen; ++i) {
+ inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+ }
+ printf("32fc_deinterleave_real_16s_aligned\n");
+
+ // The literal 32768.0 is handed to both variants unchanged.
+ // NOTE(review): presumably a float-to-int16 scale factor — confirm against the kernel header.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32fc_deinterleave_real_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32fc_deinterleave_real_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ // Leftover debugging scaffold: the loop body contains only commented-out prints.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ // The outputs are integers, and the delta is below 1 whenever
+ // |output_generic[i]| < 10000, so those samples must match exactly.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+ }
+ }
+
+#endif
diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h
new file mode 100644
index 000000000..68b80f27d
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H
+#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case exercising volk_32fc_deinterleave_real_16s_aligned16.
+// The suite registers a single test method, t1.
+class qa_32fc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_16s_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic and sse implementations (or reports a skip).
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc
new file mode 100644
index 000000000..321deb184
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32fc_deinterleave_real_32f_aligned16.h>
+#include <volk/volk_32fc_deinterleave_real_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+// Fallback compiled when LV_HAVE_SSE is not defined: reports that the
+// comparison was skipped and returns without asserting anything.
+void qa_32fc_deinterleave_real_32f_aligned16::t1()
+{
+  fputs("sse not available... no test performed\n", stdout);
+}
+
+#else
+
+// Runs volk_32fc_deinterleave_real_32f_aligned16_manual ITERS times for the
+// "generic" and for the "sse" implementation name on the same random input,
+// prints the elapsed CPU time of each run, and asserts that both output
+// vectors agree element by element.
+void qa_32fc_deinterleave_real_32f_aligned16::t1() {
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+
+  // Fill every real and imaginary component with a pseudo-random value
+  // centred on zero.
+  const float half_range = static_cast<float>(RAND_MAX / 2);
+  float* const components = reinterpret_cast<float*>(input0);
+  for (int k = 0; k != 2 * vlen; ++k) {
+    components[k] = static_cast<float>(rand() - (RAND_MAX / 2)) / half_range;
+  }
+  printf("32fc_deinterleave_real_32f_aligned\n");
+
+  // Time the generic implementation.
+  clock_t tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_deinterleave_real_32f_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  clock_t tock = clock();
+  double elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time the sse implementation.
+  tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_deinterleave_real_32f_aligned16_manual(output_sse, input0, vlen, "sse");
+  }
+  tock = clock();
+  elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse_time: %f\n", elapsed);
+
+  // The allowed difference scales with the magnitude of the generic result.
+  int idx = 0;
+  while (idx < vlen) {
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[idx], output_sse[idx], fabs(output_generic[idx])*1e-4);
+    ++idx;
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h
new file mode 100644
index 000000000..765450bb6
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case exercising volk_32fc_deinterleave_real_32f_aligned16.
+// The suite registers a single test method, t1.
+class qa_32fc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic and sse implementations (or reports a skip).
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc
new file mode 100644
index 000000000..aedb2e387
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32fc_deinterleave_real_64f_aligned16.h>
+#include <volk/volk_32fc_deinterleave_real_64f_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+// Fallback compiled when LV_HAVE_SSE2 is not defined: reports that the
+// comparison was skipped and returns without asserting anything.
+void qa_32fc_deinterleave_real_64f_aligned16::t1()
+{
+  fputs("sse2 not available... no test performed\n", stdout);
+}
+
+#else
+
+// Runs volk_32fc_deinterleave_real_64f_aligned16_manual ITERS times for the
+// "generic" and for the "sse2" implementation name on the same random input,
+// prints the elapsed CPU time of each run, and asserts that both output
+// vectors agree element by element.
+void qa_32fc_deinterleave_real_64f_aligned16::t1() {
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  double output_generic[vlen] __attribute__ ((aligned (16)));
+  double output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  // Fill every real and imaginary component with a pseudo-random value
+  // centred on zero.
+  const float half_range = static_cast<float>(RAND_MAX / 2);
+  float* const components = reinterpret_cast<float*>(input0);
+  for (int k = 0; k != 2 * vlen; ++k) {
+    components[k] = static_cast<float>(rand() - (RAND_MAX / 2)) / half_range;
+  }
+  printf("32fc_deinterleave_real_64f_aligned\n");
+
+  // Time the generic implementation.
+  clock_t tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_deinterleave_real_64f_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  clock_t tock = clock();
+  double elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time the sse2 implementation (the label printed below is "sse_time").
+  tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_deinterleave_real_64f_aligned16_manual(output_sse2, input0, vlen, "sse2");
+  }
+  tock = clock();
+  elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse_time: %f\n", elapsed);
+
+  // The allowed difference scales with the magnitude of the generic result.
+  int idx = 0;
+  while (idx < vlen) {
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[idx], output_sse2[idx], fabs(output_generic[idx])*1e-4);
+    ++idx;
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h
new file mode 100644
index 000000000..3e55fb812
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H
+#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case exercising volk_32fc_deinterleave_real_64f_aligned16.
+// The suite registers a single test method, t1.
+class qa_32fc_deinterleave_real_64f_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_64f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic and sse2 implementations (or reports a skip).
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_dot_prod_aligned16.cc b/volk/lib/qa_32fc_dot_prod_aligned16.cc
new file mode 100644
index 000000000..bcf9ea954
--- /dev/null
+++ b/volk/lib/qa_32fc_dot_prod_aligned16.cc
@@ -0,0 +1,214 @@
+#include <volk/volk.h>
+#include <qa_32fc_dot_prod_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <stdio.h>
+
+
+
+#define assertcomplexEqual(expected, actual, delta) \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);
+
+#define ERR_DELTA (1e-4)
+
+//test for sse
+// Returns a pseudo-random float in [-1, 1] derived from one rand() call.
+static float uniform() {
+  const float unit = static_cast<float>(rand()) / RAND_MAX;  // scaled to [0, 1]
+  return static_cast<float>(2.0 * (unit - 0.5));
+}
+
+// Overwrites the first n elements of buf with successive uniform() values.
+static void
+random_floats (float *buf, unsigned n)
+{
+  float *const stop = buf + n;
+  for (float *cursor = buf; cursor != stop; ++cursor)
+    *cursor = uniform ();
+}
+
+
+
+#if LV_HAVE_SSE3
+// Compares the "generic" and "sse3" implementations selected through
+// volk_32fc_dot_prod_aligned16_manual on random input and taps vectors,
+// printing the CPU time of each single call and both results, then asserts
+// that the results agree within ERR_DELTA (relative).
+void qa_32fc_dot_prod_aligned16::t1() {
+
+  const int vlen = 2046;
+  // Size in bytes of one vlen-element complex<float> vector. The same value
+  // is handed to the kernel as its length argument (originally vlen * 8).
+  const int vec_bytes = vlen * static_cast<int>(sizeof(std::complex<float>));
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+
+  // Start from NULL so every pointer may be passed to free() even when an
+  // allocation fails.
+  std::complex<float>* input = NULL;
+  std::complex<float>* taps = NULL;
+  std::complex<float>* result_generic = NULL;
+  std::complex<float>* result_sse3 = NULL;
+
+  int ret = posix_memalign(reinterpret_cast<void**>(&input), 16, vec_bytes);
+  ret |= posix_memalign(reinterpret_cast<void**>(&taps), 16, vec_bytes);
+  ret |= posix_memalign(reinterpret_cast<void**>(&result_generic), 16, sizeof(std::complex<float>));
+  ret |= posix_memalign(reinterpret_cast<void**>(&result_sse3), 16, sizeof(std::complex<float>));
+  if (ret != 0) {
+    // Do not touch unallocated buffers: release what we got and fail the test.
+    free(input);
+    free(taps);
+    free(result_generic);
+    free(result_sse3);
+    CPPUNIT_FAIL("posix_memalign failed");
+  }
+
+  result_generic[0] = std::complex<float>(0,0);
+  result_sse3[0] = std::complex<float>(0,0);
+
+  random_floats(reinterpret_cast<float*>(input), vlen * 2);
+  random_floats(reinterpret_cast<float*>(taps), vlen * 2);
+
+  printf("32fc_dot_prod_aligned16\n");
+
+  start = clock();
+  volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vec_bytes, "generic");
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vec_bytes, "sse3");
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  // Copy the results out and release the buffers before asserting: a failed
+  // CppUnit assertion throws, which would otherwise skip the free() calls.
+  const std::complex<float> generic_value = result_generic[0];
+  const std::complex<float> sse3_value = result_sse3[0];
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse3);
+
+  printf("generic: %f +i%f ... sse3: %f +i%f\n", std::real(generic_value), std::imag(generic_value), std::real(sse3_value), std::imag(sse3_value));
+
+  assertcomplexEqual(generic_value, sse3_value, ERR_DELTA);
+
+}
+
+#else
+void qa_32fc_dot_prod_aligned16::t1() {
+ printf("sse3 not available... no test performed\n");
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+// Compares the "generic" and "sse_32" implementations selected through
+// volk_32fc_dot_prod_aligned16_manual on random input and taps vectors,
+// printing the CPU time of each single call and both results, then asserts
+// that the results agree within ERR_DELTA (relative).
+void qa_32fc_dot_prod_aligned16::t2() {
+
+  const int vlen = 2046;
+  // Size in bytes of one vlen-element complex<float> vector. The same value
+  // is handed to the kernel as its length argument (originally vlen * 8).
+  const int vec_bytes = vlen * static_cast<int>(sizeof(std::complex<float>));
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+
+  // Start from NULL so every pointer may be passed to free() even when an
+  // allocation fails.
+  std::complex<float>* input = NULL;
+  std::complex<float>* taps = NULL;
+  std::complex<float>* result_generic = NULL;
+  std::complex<float>* result_sse_32 = NULL;
+
+  int ret = posix_memalign(reinterpret_cast<void**>(&input), 16, vec_bytes);
+  ret |= posix_memalign(reinterpret_cast<void**>(&taps), 16, vec_bytes);
+  ret |= posix_memalign(reinterpret_cast<void**>(&result_generic), 16, sizeof(std::complex<float>));
+  ret |= posix_memalign(reinterpret_cast<void**>(&result_sse_32), 16, sizeof(std::complex<float>));
+  if (ret != 0) {
+    // Do not touch unallocated buffers: release what we got and fail the test.
+    free(input);
+    free(taps);
+    free(result_generic);
+    free(result_sse_32);
+    CPPUNIT_FAIL("posix_memalign failed");
+  }
+
+  result_generic[0] = std::complex<float>(0,0);
+  result_sse_32[0] = std::complex<float>(0,0);
+
+  random_floats(reinterpret_cast<float*>(input), vlen * 2);
+  random_floats(reinterpret_cast<float*>(taps), vlen * 2);
+
+  printf("32fc_dot_prod_aligned16\n");
+
+  start = clock();
+  volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vec_bytes, "generic");
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  volk_32fc_dot_prod_aligned16_manual(result_sse_32, input, taps, vec_bytes, "sse_32");
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_32_time: %f\n", total);
+
+  // Copy the results out and release the buffers before asserting: a failed
+  // CppUnit assertion throws, which would otherwise skip the free() calls.
+  const std::complex<float> generic_value = result_generic[0];
+  const std::complex<float> sse_32_value = result_sse_32[0];
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse_32);
+
+  printf("generic: %f +i%f ... sse_32: %f +i%f\n", std::real(generic_value), std::imag(generic_value), std::real(sse_32_value), std::imag(sse_32_value));
+
+  assertcomplexEqual(generic_value, sse_32_value, ERR_DELTA);
+
+}
+
+#else
+// Fallback compiled unless both LV_HAVE_SSE and LV_HAVE_32 are non-zero:
+// reports that the comparison was skipped and returns without asserting.
+void qa_32fc_dot_prod_aligned16::t2()
+{
+  fputs("sse_32 not available... no test performed\n", stdout);
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+// Compares the "generic" and "sse_64" implementations selected through
+// volk_32fc_dot_prod_aligned16_manual on random input and taps vectors,
+// printing the CPU time of each single call and both results, then asserts
+// that the results agree within ERR_DELTA (relative).
+void qa_32fc_dot_prod_aligned16::t3() {
+
+  const int vlen = 2046;
+  // Size in bytes of one vlen-element complex<float> vector. The same value
+  // is handed to the kernel as its length argument (originally vlen * 8).
+  const int vec_bytes = vlen * static_cast<int>(sizeof(std::complex<float>));
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+
+  // Start from NULL so every pointer may be passed to free() even when an
+  // allocation fails.
+  std::complex<float>* input = NULL;
+  std::complex<float>* taps = NULL;
+  std::complex<float>* result_generic = NULL;
+  std::complex<float>* result_sse_64 = NULL;
+
+  int ret = posix_memalign(reinterpret_cast<void**>(&input), 16, vec_bytes);
+  ret |= posix_memalign(reinterpret_cast<void**>(&taps), 16, vec_bytes);
+  ret |= posix_memalign(reinterpret_cast<void**>(&result_generic), 16, sizeof(std::complex<float>));
+  ret |= posix_memalign(reinterpret_cast<void**>(&result_sse_64), 16, sizeof(std::complex<float>));
+  if (ret != 0) {
+    // Do not touch unallocated buffers: release what we got and fail the test.
+    free(input);
+    free(taps);
+    free(result_generic);
+    free(result_sse_64);
+    CPPUNIT_FAIL("posix_memalign failed");
+  }
+
+  result_generic[0] = std::complex<float>(0,0);
+  result_sse_64[0] = std::complex<float>(0,0);
+
+  random_floats(reinterpret_cast<float*>(input), vlen * 2);
+  random_floats(reinterpret_cast<float*>(taps), vlen * 2);
+
+  printf("32fc_dot_prod_aligned16\n");
+
+  start = clock();
+  volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vec_bytes, "generic");
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  volk_32fc_dot_prod_aligned16_manual(result_sse_64, input, taps, vec_bytes, "sse_64");
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_64_time: %f\n", total);
+
+  // Copy the results out and release the buffers before asserting: a failed
+  // CppUnit assertion throws, which would otherwise skip the free() calls.
+  const std::complex<float> generic_value = result_generic[0];
+  const std::complex<float> sse_64_value = result_sse_64[0];
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse_64);
+
+  printf("generic: %f +i%f ... sse_64: %f +i%f\n", std::real(generic_value), std::imag(generic_value), std::real(sse_64_value), std::imag(sse_64_value));
+
+  assertcomplexEqual(generic_value, sse_64_value, ERR_DELTA);
+
+}
+
+#else
+// Fallback compiled unless both LV_HAVE_SSE and LV_HAVE_64 are non-zero:
+// reports that the comparison was skipped and returns without asserting.
+void qa_32fc_dot_prod_aligned16::t3()
+{
+  fputs("sse_64 not available... no test performed\n", stdout);
+}
+
+
+
+#endif
diff --git a/volk/lib/qa_32fc_dot_prod_aligned16.h b/volk/lib/qa_32fc_dot_prod_aligned16.h
new file mode 100644
index 000000000..4b360db27
--- /dev/null
+++ b/volk/lib/qa_32fc_dot_prod_aligned16.h
@@ -0,0 +1,20 @@
+#ifndef INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H
+#define INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case exercising volk_32fc_dot_prod_aligned16.
+// All three declared test methods are registered with the suite; previously
+// only t1 was registered, so t2 and t3 were compiled but never executed.
+class qa_32fc_dot_prod_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_dot_prod_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST (t2);
+ CPPUNIT_TEST (t3);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // generic vs. sse3 (or a skip message when sse3 is unavailable)
+ void t1 ();
+ // generic vs. sse_32 (or a skip message when unavailable)
+ void t2 ();
+ // generic vs. sse_64 (or a skip message when unavailable)
+ void t3 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc
new file mode 100644
index 000000000..4d83f1639
--- /dev/null
+++ b/volk/lib/qa_32fc_index_max_aligned16.cc
@@ -0,0 +1,89 @@
+#include <volk/volk.h>
+#include <qa_32fc_index_max_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 1000000
+#define VEC_LEN 3096
+// Returns a pseudo-random float in [-1, 1] derived from one rand() call.
+static float uniform() {
+  const float unit = static_cast<float>(rand()) / RAND_MAX;  // scaled to [0, 1]
+  return static_cast<float>(2.0 * (unit - 0.5));
+}
+
+// Overwrites the first n elements of buf with uniform() values scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  float *const stop = buf + n;
+  for (float *cursor = buf; cursor != stop; ++cursor)
+    *cursor = uniform () * 32767;
+}
+
+
+#ifndef LV_HAVE_SSE3
+
+// Fallback compiled when LV_HAVE_SSE3 is not defined: reports that the
+// comparison was skipped and returns without asserting anything.
+void qa_32fc_index_max_aligned16::t1()
+{
+  fputs("sse3 not available... no test performed\n", stdout);
+}
+
+#else
+
+
+// Runs volk_32fc_index_max_aligned16_manual NUM_ITERS times for the
+// "generic" and for the "sse3" implementation name on the same random
+// vector, prints the elapsed CPU time of each run and both resulting
+// indices, then asserts that the indices agree.
+void qa_32fc_index_max_aligned16::t1(){
+
+  const int vlen = VEC_LEN;
+
+  volk_environment_init();
+
+  // Zero-initialised so the printout and the comparison read defined values
+  // even if an implementation leaves its output untouched.
+  unsigned int index_generic = 0;
+  unsigned int index_sse3 = 0;
+
+  std::complex<float>* src0 = NULL;
+  const int ret = posix_memalign(reinterpret_cast<void**>(&src0), 16, vlen << 3);
+  // Nothing has been allocated when this fails, so failing here leaks nothing.
+  CPPUNIT_ASSERT(ret == 0);
+
+  random_floats(reinterpret_cast<float*>(src0), vlen * 2);
+
+  printf("32fc_index_max_aligned16\n");
+
+  clock_t start, end;
+  double total;
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_index_max_aligned16_manual(&index_generic, src0, vlen << 3, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_index_max_aligned16_manual(&index_sse3, src0, vlen << 3, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3 time: %f\n", total);
+
+  // Release the buffer before asserting: a failed CppUnit assertion throws,
+  // which would otherwise skip the free() call.
+  free(src0);
+
+  printf("generic: %u, sse3: %u\n", index_generic, index_sse3);
+  // The tolerance of 1.1 accepts indices that differ by at most one.
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(index_generic, index_sse3, 1.1);
+}
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32fc_index_max_aligned16.h b/volk/lib/qa_32fc_index_max_aligned16.h
new file mode 100644
index 000000000..0990bcb1f
--- /dev/null
+++ b/volk/lib/qa_32fc_index_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case exercising volk_32fc_index_max_aligned16.
+// The suite registers a single test method, t1.
+class qa_32fc_index_max_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic and sse3 implementations (or reports a skip).
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
new file mode 100644
index 000000000..a4be1616b
--- /dev/null
+++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32fc_magnitude_16s_aligned16.h>
+#include <volk/volk_32fc_magnitude_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse and sse3
+
+#ifndef LV_HAVE_SSE3
+
+// Fallback compiled when LV_HAVE_SSE3 is not defined: reports that the
+// comparison was skipped and returns without asserting anything.
+void qa_32fc_magnitude_16s_aligned16::t1()
+{
+  fputs("sse3 not available... no test performed\n", stdout);
+}
+
+#else
+
+// Runs volk_32fc_magnitude_16s_aligned16_manual ITERS times for each of the
+// "generic", "sse" and "sse3" implementation names on the same random
+// input, prints the elapsed CPU time of each run, and asserts that the sse
+// and sse3 outputs each stay within 1.1 of the generic output.
+void qa_32fc_magnitude_16s_aligned16::t1() {
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse3[vlen] __attribute__ ((aligned (16)));
+
+  // Fill every real and imaginary component with a pseudo-random value
+  // centred on zero.
+  const float half_range = static_cast<float>(RAND_MAX / 2);
+  float* const components = reinterpret_cast<float*>(input0);
+  for (int k = 0; k != 2 * vlen; ++k) {
+    components[k] = static_cast<float>(rand() - (RAND_MAX / 2)) / half_range;
+  }
+  printf("32fc_magnitude_16s_aligned\n");
+
+  // Time the generic implementation.
+  clock_t tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_magnitude_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  clock_t tock = clock();
+  double elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time the sse implementation.
+  tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_magnitude_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  tock = clock();
+  elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse_time: %f\n", elapsed);
+
+  // Time the sse3 implementation.
+  tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_magnitude_16s_aligned16_manual(output_sse3, input0, 32768.0, vlen, "sse3");
+  }
+  tock = clock();
+  elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse3_time: %f\n", elapsed);
+
+  // Absolute tolerance of 1.1 on the 16-bit outputs.
+  int idx = 0;
+  while (idx < vlen) {
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[idx], output_sse[idx], 1.1);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[idx], output_sse3[idx], 1.1);
+    ++idx;
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.h b/volk/lib/qa_32fc_magnitude_16s_aligned16.h
new file mode 100644
index 000000000..ffdf1dd9e
--- /dev/null
+++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H
+#define INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case exercising volk_32fc_magnitude_16s_aligned16.
+// The suite registers a single test method, t1.
+class qa_32fc_magnitude_16s_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_magnitude_16s_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic, sse and sse3 implementations (or reports a skip).
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc
new file mode 100644
index 000000000..d69ada408
--- /dev/null
+++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32fc_magnitude_32f_aligned16.h>
+#include <volk/volk_32fc_magnitude_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse and sse3
+
+#ifndef LV_HAVE_SSE3
+
+// Fallback compiled when LV_HAVE_SSE3 is not defined: reports that the
+// comparison was skipped and returns without asserting anything.
+void qa_32fc_magnitude_32f_aligned16::t1()
+{
+  fputs("sse3 not available... no test performed\n", stdout);
+}
+
+#else
+
+// Runs volk_32fc_magnitude_32f_aligned16_manual ITERS times for each of the
+// "generic", "sse" and "sse3" implementation names on the same random
+// input, prints the elapsed CPU time of each run, and asserts that the sse
+// and sse3 outputs each agree with the generic output.
+void qa_32fc_magnitude_32f_aligned16::t1() {
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 100000;
+
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+  float output_sse3[vlen] __attribute__ ((aligned (16)));
+
+  // Fill every real and imaginary component with a pseudo-random value
+  // centred on zero.
+  const float half_range = static_cast<float>(RAND_MAX / 2);
+  float* const components = reinterpret_cast<float*>(input0);
+  for (int k = 0; k != 2 * vlen; ++k) {
+    components[k] = static_cast<float>(rand() - (RAND_MAX / 2)) / half_range;
+  }
+  printf("32fc_magnitude_32f_aligned\n");
+
+  // Time the generic implementation.
+  clock_t tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_magnitude_32f_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  clock_t tock = clock();
+  double elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time the sse implementation.
+  tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_magnitude_32f_aligned16_manual(output_sse, input0, vlen, "sse");
+  }
+  tock = clock();
+  elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse_time: %f\n", elapsed);
+
+  // Time the sse3 implementation.
+  tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_magnitude_32f_aligned16_manual(output_sse3, input0, vlen, "sse3");
+  }
+  tock = clock();
+  elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse3_time: %f\n", elapsed);
+
+  // The allowed difference scales with the magnitude of the generic result.
+  int idx = 0;
+  while (idx < vlen) {
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[idx], output_sse[idx], fabs(output_generic[idx])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[idx], output_sse3[idx], fabs(output_generic[idx])*1e-4);
+    ++idx;
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.h b/volk/lib/qa_32fc_magnitude_32f_aligned16.h
new file mode 100644
index 000000000..a2881308c
--- /dev/null
+++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case exercising volk_32fc_magnitude_32f_aligned16.
+// The suite registers a single test method, t1.
+class qa_32fc_magnitude_32f_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_magnitude_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic, sse and sse3 implementations (or reports a skip).
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_multiply_aligned16.cc b/volk/lib/qa_32fc_multiply_aligned16.cc
new file mode 100644
index 000000000..e1f7eab3d
--- /dev/null
+++ b/volk/lib/qa_32fc_multiply_aligned16.cc
@@ -0,0 +1,86 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32fc_multiply_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+
+
+#define assertcomplexEqual(expected, actual, delta) \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);
+
+#define ERR_DELTA (1e-3)
+
+//test for sse
+// Returns a pseudo-random float in [-1, 1] derived from one rand() call.
+static float uniform() {
+  const float unit = static_cast<float>(rand()) / RAND_MAX;  // scaled to [0, 1]
+  return static_cast<float>(2.0 * (unit - 0.5));
+}
+
+// Overwrites the first n elements of buf with successive uniform() values.
+static void
+random_floats (float *buf, unsigned n)
+{
+  float *const stop = buf + n;
+  for (float *cursor = buf; cursor != stop; ++cursor)
+    *cursor = uniform ();
+}
+
+#ifdef LV_HAVE_SSE3
+// Runs volk_32fc_multiply_aligned16_manual ITERS times for the "generic"
+// and for the "sse3" implementation name on the same random vectors, prints
+// the elapsed CPU time of each run, and asserts that every output element
+// agrees within ERR_DELTA (relative).
+void qa_32fc_multiply_aligned16::t1() {
+
+  const int vlen = 2046;
+  const int ITERS = 100000;
+  // Size in bytes of one vlen-element complex<float> vector.
+  const size_t vec_bytes = vlen*2*sizeof(float);
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+
+  // Start from NULL so every pointer may be passed to free() even when an
+  // allocation fails.
+  std::complex<float>* input = NULL;
+  std::complex<float>* taps = NULL;
+  std::complex<float>* result_generic = NULL;
+  std::complex<float>* result_sse3 = NULL;
+
+  int ret = posix_memalign(reinterpret_cast<void**>(&input), 16, vec_bytes);
+  ret |= posix_memalign(reinterpret_cast<void**>(&taps), 16, vec_bytes);
+  ret |= posix_memalign(reinterpret_cast<void**>(&result_generic), 16, vec_bytes);
+  ret |= posix_memalign(reinterpret_cast<void**>(&result_sse3), 16, vec_bytes);
+  if (ret != 0) {
+    // Do not touch unallocated buffers: release what we got and fail the test.
+    free(input);
+    free(taps);
+    free(result_generic);
+    free(result_sse3);
+    CPPUNIT_FAIL("posix_memalign failed");
+  }
+
+  random_floats(reinterpret_cast<float*>(input), vlen * 2);
+  random_floats(reinterpret_cast<float*>(taps), vlen * 2);
+
+  printf("32fc_multiply_aligned16\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_multiply_aligned16_manual(result_generic, input, taps, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  for(int i = 0; i < vlen; i++){
+    assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA);
+  }
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse3);
+
+}
+#else
+// Fallback compiled when LV_HAVE_SSE3 is not defined: reports that the
+// comparison was skipped and returns without asserting anything.
+void qa_32fc_multiply_aligned16::t1()
+{
+  fputs("sse3 not available... no test performed\n", stdout);
+}
+
+#endif /* LV_HAVE_SSE3 */
diff --git a/volk/lib/qa_32fc_multiply_aligned16.h b/volk/lib/qa_32fc_multiply_aligned16.h
new file mode 100644
index 000000000..c8abaa8fe
--- /dev/null
+++ b/volk/lib/qa_32fc_multiply_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H
+#define INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case exercising volk_32fc_multiply_aligned16.
+// The suite registers a single test method, t1.
+class qa_32fc_multiply_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_multiply_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic and sse3 implementations (or reports a skip).
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
new file mode 100644
index 000000000..83cdf4b15
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk.h>
+#include <qa_32fc_power_spectral_density_32f_aligned16.h>
+#include <volk/volk_32fc_power_spectral_density_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse3
+
+#ifndef LV_HAVE_SSE3
+
+// Fallback compiled when LV_HAVE_SSE3 is not defined: reports that the
+// comparison was skipped and returns without asserting anything.
+void qa_32fc_power_spectral_density_32f_aligned16::t1()
+{
+  fputs("sse3 not available... no test performed\n", stdout);
+}
+
+#else
+
+// Runs volk_32fc_power_spectral_density_32f_aligned16_manual ITERS times
+// for the "generic" and for the "sse3" implementation name on the same
+// random input, prints the elapsed CPU time of each run, and asserts that
+// both output vectors agree element by element.
+void qa_32fc_power_spectral_density_32f_aligned16::t1() {
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 10000;
+
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse3[vlen] __attribute__ ((aligned (16)));
+
+  // Extra kernel arguments, passed unchanged to both implementations.
+  const float scalar = vlen;
+  const float rbw = 1.7;
+
+  // Fill every real and imaginary component with a pseudo-random value
+  // centred on zero.
+  const float half_range = static_cast<float>(RAND_MAX / 2);
+  float* const components = reinterpret_cast<float*>(input0);
+  for (int k = 0; k != 2 * vlen; ++k) {
+    components[k] = static_cast<float>(rand() - (RAND_MAX / 2)) / half_range;
+  }
+  printf("32fc_power_spectral_density_32f_aligned\n");
+
+  // Time the generic implementation.
+  clock_t tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic");
+  }
+  clock_t tock = clock();
+  double elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time the sse3 implementation.
+  tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3");
+  }
+  tock = clock();
+  elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse3_time: %f\n", elapsed);
+
+  // The allowed difference scales with the magnitude of the generic result.
+  int idx = 0;
+  while (idx < vlen) {
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[idx], output_sse3[idx], fabs(output_generic[idx]*1e-4));
+    ++idx;
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h
new file mode 100644
index 000000000..26f430bec
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case exercising volk_32fc_power_spectral_density_32f_aligned16.
+// The suite registers a single test method, t1.
+class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic and sse3 implementations (or reports a skip).
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc
new file mode 100644
index 000000000..4d1359068
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk.h>
+#include <qa_32fc_power_spectrum_32f_aligned16.h>
+#include <volk/volk_32fc_power_spectrum_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse3
+
+#ifndef LV_HAVE_SSE3
+
+// Fallback compiled when LV_HAVE_SSE3 is not defined: reports that the
+// comparison was skipped and returns without asserting anything.
+void qa_32fc_power_spectrum_32f_aligned16::t1()
+{
+  fputs("sse3 not available... no test performed\n", stdout);
+}
+
+#else
+
+// Runs volk_32fc_power_spectrum_32f_aligned16_manual ITERS times for the
+// "generic" and for the "sse3" implementation name on the same random
+// input, prints the elapsed CPU time of each run, and asserts that both
+// output vectors agree element by element.
+void qa_32fc_power_spectrum_32f_aligned16::t1() {
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 10000;
+
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse3[vlen] __attribute__ ((aligned (16)));
+
+  // Extra kernel argument, passed unchanged to both implementations.
+  const float scalar = vlen;
+
+  // Fill every real and imaginary component with a pseudo-random value
+  // centred on zero.
+  const float half_range = static_cast<float>(RAND_MAX / 2);
+  float* const components = reinterpret_cast<float*>(input0);
+  for (int k = 0; k != 2 * vlen; ++k) {
+    components[k] = static_cast<float>(rand() - (RAND_MAX / 2)) / half_range;
+  }
+
+  printf("32fc_power_spectrum_32f_aligned\n");
+
+  // Time the generic implementation.
+  clock_t tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_power_spectrum_32f_aligned16_manual(output_generic, input0, scalar, vlen, "generic");
+  }
+  clock_t tock = clock();
+  double elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("generic_time: %f\n", elapsed);
+
+  // Time the sse3 implementation.
+  tick = clock();
+  for (int pass = ITERS; pass > 0; --pass) {
+    volk_32fc_power_spectrum_32f_aligned16_manual(output_sse3, input0, scalar, vlen, "sse3");
+  }
+  tock = clock();
+  elapsed = static_cast<double>(tock - tick) / static_cast<double>(CLOCKS_PER_SEC);
+  printf("sse3_time: %f\n", elapsed);
+
+  // The allowed difference scales with the magnitude of the generic result.
+  int idx = 0;
+  while (idx < vlen) {
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[idx], output_sse3[idx], fabs(output_generic[idx]*1e-4));
+    ++idx;
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h
new file mode 100644
index 000000000..d991223f3
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case exercising volk_32fc_power_spectrum_32f_aligned16.
+// The suite registers a single test method, t1.
+class qa_32fc_power_spectrum_32f_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_power_spectrum_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic and sse3 implementations (or reports a skip).
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_square_dist_aligned16.cc b/volk/lib/qa_32fc_square_dist_aligned16.cc
new file mode 100644
index 000000000..d9ead8495
--- /dev/null
+++ b/volk/lib/qa_32fc_square_dist_aligned16.cc
@@ -0,0 +1,91 @@
+#include <volk/volk.h>
+#include <qa_32fc_square_dist_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 10000000
+#define VEC_LEN 64
+// Returns a pseudo-random float in [-1, 1] derived from one rand() call.
+static float uniform() {
+  const float unit = static_cast<float>(rand()) / RAND_MAX;  // scaled to [0, 1]
+  return static_cast<float>(2.0 * (unit - 0.5));
+}
+
+// Overwrites the first n elements of buf with uniform() values scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  float *const stop = buf + n;
+  for (float *cursor = buf; cursor != stop; ++cursor)
+    *cursor = uniform () * 32767;
+}
+
+
+#ifndef LV_HAVE_SSE3
+
+// Fallback compiled when LV_HAVE_SSE3 is not defined: reports that the
+// comparison was skipped and returns without asserting anything.
+void qa_32fc_square_dist_aligned16::t1()
+{
+  fputs("sse3 not available... no test performed\n", stdout);
+}
+
+#else
+
+
+// Runs volk_32fc_square_dist_aligned16_manual NUM_ITERS times for the
+// "generic" and for the "sse3" implementation name on the same random
+// source value and points vector, prints the elapsed CPU time of each run,
+// and asserts that every output element agrees within ERR_DELTA (relative).
+void qa_32fc_square_dist_aligned16::t1(){
+
+  const int vlen = VEC_LEN;
+  volk_environment_init();
+
+  // Start from NULL so every pointer may be passed to free() even when an
+  // allocation fails.
+  float* target = NULL;
+  float* target_generic = NULL;
+  std::complex<float>* src0 = NULL;
+  std::complex<float>* points = NULL;
+
+  // points holds vlen complex values (8 bytes each), the targets hold vlen
+  // floats (4 bytes each) and src0 holds one complex value.
+  int ret = posix_memalign(reinterpret_cast<void**>(&points), 16, vlen << 3);
+  ret |= posix_memalign(reinterpret_cast<void**>(&target), 16, vlen << 2);
+  ret |= posix_memalign(reinterpret_cast<void**>(&target_generic), 16, vlen << 2);
+  ret |= posix_memalign(reinterpret_cast<void**>(&src0), 16, 8);
+  if (ret != 0) {
+    // Do not touch unallocated buffers: release what we got and fail the test.
+    free(target);
+    free(target_generic);
+    free(points);
+    free(src0);
+    CPPUNIT_FAIL("posix_memalign failed");
+  }
+
+  random_floats(reinterpret_cast<float*>(points), vlen * 2);
+  random_floats(reinterpret_cast<float*>(src0), 2);
+
+  printf("32fc_square_dist_aligned16\n");
+
+  clock_t start, end;
+  double total;
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_square_dist_aligned16_manual(target_generic, src0, points, vlen << 3, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_square_dist_aligned16_manual(target, src0, points, vlen << 3, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3 time: %f\n", total);
+
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[i], target[i], fabs(target_generic[i]) * ERR_DELTA);
+  }
+
+  free(target);
+  free(target_generic);
+  free(points);
+  free(src0);
+}
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32fc_square_dist_aligned16.h b/volk/lib/qa_32fc_square_dist_aligned16.h
new file mode 100644
index 000000000..9d365d8b0
--- /dev/null
+++ b/volk/lib/qa_32fc_square_dist_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H
+#define INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_square_dist_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_32fc_square_dist_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_32fc_square_dist_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc
new file mode 100644
index 000000000..f923d1d5c
--- /dev/null
+++ b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc
@@ -0,0 +1,96 @@
+#include <volk/volk.h>
+#include <qa_32fc_square_dist_scalar_mult_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define ERR_DELTA .0001
+#define NUM_ITERS 10000000
+#define VEC_LEN 64
+
+static float uniform() {
+ return (static_cast<float>(rand()) / RAND_MAX - 0.5) * 2.0; // rand() rescaled to roughly (-1, 1)
+}
+
+// Fill buf[0..n) with uniform() samples scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+ float *const stop = buf + n;
+ for (float *cursor = buf; cursor != stop; ++cursor) {
+ // uniform() is the helper defined just above in this file.
+ *cursor = uniform () * 32767;
+ }
+}
+
+
+#ifndef LV_HAVE_SSE3
+
+void qa_32fc_square_dist_scalar_mult_aligned16::t1(){ // variant compiled when LV_HAVE_SSE3 is not defined
+ printf("sse3 not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+
+void qa_32fc_square_dist_scalar_mult_aligned16::t1(){
+ // Times the "generic" and "sse3" variants, then checks they agree element by element.
+ int i = 0;
+ const int vlen = VEC_LEN;
+ volk_environment_init();
+ int ret;
+ float* target;
+ float* target_generic;
+ std::complex<float>* src0 ;
+ std::complex<float>* points;
+ float scalar;
+
+ // posix_memalign returns 0 on success; fail here rather than use an unset pointer.
+ ret = posix_memalign((void**)&points, 16, vlen << 3);
+ CPPUNIT_ASSERT_EQUAL(0, ret);
+ ret = posix_memalign((void**)&target, 16, vlen << 2);
+ CPPUNIT_ASSERT_EQUAL(0, ret);
+ ret = posix_memalign((void**)&target_generic, 16, vlen << 2);
+ CPPUNIT_ASSERT_EQUAL(0, ret);
+ ret = posix_memalign((void**)&src0, 16, 8);
+ CPPUNIT_ASSERT_EQUAL(0, ret);
+
+ random_floats((float*)points, vlen * 2);
+ random_floats((float*)src0, 2);
+ random_floats(&scalar, 1);
+
+ printf("32fc_square_dist_scalar_mult_aligned16\n");
+
+ clock_t start, end;
+ double total;
+
+ start = clock();
+ for(int k = 0; k < NUM_ITERS; ++k) {
+ volk_32fc_square_dist_scalar_mult_aligned16_manual(target_generic, src0, points, scalar, vlen << 3, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic time: %f\n", total);
+
+ start = clock();
+ for(int k = 0; k < NUM_ITERS; ++k) {
+ volk_32fc_square_dist_scalar_mult_aligned16_manual(target, src0, points, scalar, vlen << 3, "sse3");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse3 time: %f\n", total);
+
+ // Relative tolerance per element: scale ERR_DELTA by element i, not by the fixed element 1.
+ for(i = 0; i < vlen; ++i) {
+ printf("generic: %f, sse3: %f\n", target_generic[i], target[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(target[i], target_generic[i], fabs(target_generic[i]) * ERR_DELTA);
+ }
+
+ free(target);
+ free(target_generic);
+ free(points);
+ free(src0);
+}
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h
new file mode 100644
index 000000000..ac4e3c45b
--- /dev/null
+++ b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H
+#define INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_square_dist_scalar_mult_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_32fc_square_dist_scalar_mult_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_32fc_square_dist_scalar_mult_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H */
diff --git a/volk/lib/qa_32s_and_aligned16.cc b/volk/lib/qa_32s_and_aligned16.cc
new file mode 100644
index 000000000..72d05cf6f
--- /dev/null
+++ b/volk/lib/qa_32s_and_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32s_and_aligned16.h>
+#include <volk/volk_32s_and_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32s_and_aligned16::t1() { // variant compiled when LV_HAVE_SSE is not defined
+ printf("sse not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+void qa_32s_and_aligned16::t1() {
+ // Times the "generic" and "sse" variants on shared inputs and requires identical outputs.
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ int32_t input0[vlen] __attribute__ ((aligned (16)));
+ int32_t input1[vlen] __attribute__ ((aligned (16)));
+ // output0 receives the generic result, output01 the sse result.
+ int32_t output0[vlen] __attribute__ ((aligned (16)));
+ int32_t output01[vlen] __attribute__ ((aligned (16)));
+ // Inputs are rand() shifted down by RAND_MAX/2, so both signs occur.
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((int32_t) (rand() - (RAND_MAX/2)));
+ input1[i] = ((int32_t) (rand() - (RAND_MAX/2)));
+ }
+ printf("32s_and_aligned\n");
+ // Time ITERS calls of the generic variant.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32s_and_aligned16_manual(output0, input0, input1, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock(); // same measurement for the sse variant
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32s_and_aligned16_manual(output01, input0, input1, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+ for(int i = 0; i < 1; ++i) { // placeholder loop: body is only commented-out debug output
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // Every element of the two outputs must match exactly.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32s_and_aligned16.h b/volk/lib/qa_32s_and_aligned16.h
new file mode 100644
index 000000000..dfcb47c63
--- /dev/null
+++ b/volk/lib/qa_32s_and_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32S_AND_ALIGNED16_H
+#define INCLUDED_QA_32S_AND_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32s_and_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_32s_and_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_32s_and_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32S_AND_ALIGNED16_H */
diff --git a/volk/lib/qa_32s_convert_32f_aligned16.cc b/volk/lib/qa_32s_convert_32f_aligned16.cc
new file mode 100644
index 000000000..eab3fe016
--- /dev/null
+++ b/volk/lib/qa_32s_convert_32f_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32s_convert_32f_aligned16.h>
+#include <volk/volk_32s_convert_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32s_convert_32f_aligned16::t1() { // variant compiled when LV_HAVE_SSE2 is not defined
+ printf("sse2 not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+void qa_32s_convert_32f_aligned16::t1() {
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ // Times the "generic" and "sse2" variants on one shared input and requires identical outputs.
+ int32_t input0[vlen] __attribute__ ((aligned (16)));
+ // One output buffer per variant.
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_sse2[vlen] __attribute__ ((aligned (16)));
+ // rand() is centred on zero, divided by RAND_MAX/2, multiplied by 32768.0 and truncated to int32_t.
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((int32_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0));
+ }
+ printf("32s_convert_32f_aligned\n");
+ // NOTE(review): the 32768.0 argument is presumably the conversion scale factor - confirm in the kernel header.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32s_convert_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ // Same measurement for the sse2 variant.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32s_convert_32f_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) { // placeholder loop: body is only commented-out debug output
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // The float outputs are compared for exact equality, not within a tolerance.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32s_convert_32f_aligned16.h b/volk/lib/qa_32s_convert_32f_aligned16.h
new file mode 100644
index 000000000..efd2a2eea
--- /dev/null
+++ b/volk/lib/qa_32s_convert_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H
+#define INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32s_convert_32f_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_32s_convert_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_32s_convert_32f_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.cc b/volk/lib/qa_32s_convert_32f_unaligned16.cc
new file mode 100644
index 000000000..0e504cfa1
--- /dev/null
+++ b/volk/lib/qa_32s_convert_32f_unaligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32s_convert_32f_unaligned16.h>
+#include <volk/volk_32s_convert_32f_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32s_convert_32f_unaligned16::t1() { // variant compiled when LV_HAVE_SSE2 is not defined
+ printf("sse2 not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+void qa_32s_convert_32f_unaligned16::t1() {
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ // Times the "generic" and "sse2" variants on one shared input and requires identical outputs.
+ int32_t input0[vlen] __attribute__ ((aligned (16)));
+ // One output buffer per variant; note the buffers here are still declared 16-byte aligned.
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_sse2[vlen] __attribute__ ((aligned (16)));
+ // rand() is centred on zero, divided by RAND_MAX/2, multiplied by 32768.0 and truncated to int32_t.
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((int32_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0));
+ }
+ printf("32s_convert_32f_unaligned\n");
+ // NOTE(review): the 32768.0 argument is presumably the conversion scale factor - confirm in the kernel header.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32s_convert_32f_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ // Same measurement for the sse2 variant.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32s_convert_32f_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) { // placeholder loop: body is only commented-out debug output
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // The float outputs are compared for exact equality, not within a tolerance.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.h b/volk/lib/qa_32s_convert_32f_unaligned16.h
new file mode 100644
index 000000000..5006f5fd8
--- /dev/null
+++ b/volk/lib/qa_32s_convert_32f_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H
+#define INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32s_convert_32f_unaligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_32s_convert_32f_unaligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_32s_convert_32f_unaligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H */
diff --git a/volk/lib/qa_32s_or_aligned16.cc b/volk/lib/qa_32s_or_aligned16.cc
new file mode 100644
index 000000000..e09dfb91c
--- /dev/null
+++ b/volk/lib/qa_32s_or_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32s_or_aligned16.h>
+#include <volk/volk_32s_or_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32s_or_aligned16::t1() { // variant compiled when LV_HAVE_SSE is not defined
+ printf("sse not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+void qa_32s_or_aligned16::t1() {
+ // Times the "generic" and "sse" variants on shared inputs and requires identical outputs.
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ int32_t input0[vlen] __attribute__ ((aligned (16)));
+ int32_t input1[vlen] __attribute__ ((aligned (16)));
+ // output0 receives the generic result, output01 the sse result.
+ int32_t output0[vlen] __attribute__ ((aligned (16)));
+ int32_t output01[vlen] __attribute__ ((aligned (16)));
+ // Inputs are rand() shifted down by RAND_MAX/2, so both signs occur.
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((int32_t) (rand() - (RAND_MAX/2)));
+ input1[i] = ((int32_t) (rand() - (RAND_MAX/2)));
+ }
+ printf("32s_or_aligned\n");
+ // Time ITERS calls of the generic variant.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32s_or_aligned16_manual(output0, input0, input1, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock(); // same measurement for the sse variant
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32s_or_aligned16_manual(output01, input0, input1, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+ for(int i = 0; i < 1; ++i) { // placeholder loop: body is only commented-out debug output
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // Every element of the two outputs must match exactly.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32s_or_aligned16.h b/volk/lib/qa_32s_or_aligned16.h
new file mode 100644
index 000000000..9e949eb52
--- /dev/null
+++ b/volk/lib/qa_32s_or_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32S_OR_ALIGNED16_H
+#define INCLUDED_QA_32S_OR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32s_or_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_32s_or_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_32s_or_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32S_OR_ALIGNED16_H */
diff --git a/volk/lib/qa_32u_byteswap_aligned16.cc b/volk/lib/qa_32u_byteswap_aligned16.cc
new file mode 100644
index 000000000..8b1023876
--- /dev/null
+++ b/volk/lib/qa_32u_byteswap_aligned16.cc
@@ -0,0 +1,59 @@
+#include <volk/volk.h>
+#include <qa_32u_byteswap_aligned16.h>
+#include <volk/volk_32u_byteswap_aligned16.h>
+#include <cstdlib>
+#include <cstring>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32u_byteswap_aligned16::t1() { // variant compiled when LV_HAVE_SSE2 is not defined
+ printf("sse2 not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+void qa_32u_byteswap_aligned16::t1() {
+ // Byte-swaps two copies of the same data with the "generic" and "sse2" variants and compares them.
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100001;
+ // ITERS is odd: the buffers are swapped in place, so an even count would undo the swap.
+ uint32_t output0[vlen] __attribute__ ((aligned (16)));
+ uint32_t output01[vlen] __attribute__ ((aligned (16)));
+ // Use raw rand() bits; the old integer division by RAND_MAX/2 left almost every word 0.
+ for(int i = 0; i < vlen; ++i) {
+ output0[i] = ((uint32_t) rand() << 16) ^ (uint32_t) rand();
+ }
+ memcpy(output01, output0, vlen*sizeof(uint32_t));
+ printf("32u_byteswap_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32u_byteswap_aligned16_manual(output0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32u_byteswap_aligned16_manual(output01, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // Both buffers started identical, so they must still match word for word.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32u_byteswap_aligned16.h b/volk/lib/qa_32u_byteswap_aligned16.h
new file mode 100644
index 000000000..47bad4c3d
--- /dev/null
+++ b/volk/lib/qa_32u_byteswap_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H
+#define INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32u_byteswap_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_32u_byteswap_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_32u_byteswap_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H */
diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc
new file mode 100644
index 000000000..49fcddeb2
--- /dev/null
+++ b/volk/lib/qa_32u_popcnt_aligned16.cc
@@ -0,0 +1,61 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32u_popcnt_aligned16.h>
+#include <volk/volk_32u_popcnt_aligned16.h>
+#include <cstdlib>
+
+//test for sse4.2
+
+#ifndef LV_HAVE_SSE4_2
+
+void qa_32u_popcnt_aligned16::t1() { // variant compiled when LV_HAVE_SSE4_2 is not defined
+ printf("sse4.2 not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+void qa_32u_popcnt_aligned16::t1() {
+ // Compares the "generic" popcnt variant with whatever the runtime dispatch table selects,
+ // using one random word and the accumulated result of ITERS calls on each path.
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+
+ const int ITERS = 10000000;
+ uint32_t input0 __attribute__ ((aligned (16)));
+ // output0 accumulates the generic results, output01 the runtime-dispatched results.
+ uint32_t output0 __attribute__ ((aligned (16)));
+ uint32_t output01 __attribute__ ((aligned (16)));
+
+ input0 = ((uint32_t) (rand() - (RAND_MAX/2)));
+ output0 = 0;
+ output01 = 0;
+
+ printf("32u_popcnt_aligned\n");
+
+ start = clock();
+ uint32_t ret = 0;
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32u_popcnt_aligned16_manual(&ret, input0, "generic");
+ output0 += ret;
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ ret = 0;
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_32u_popcnt_aligned16(&ret, input0); // NOTE(review): presumably the SSE4.2 kernel on capable CPUs - confirm
+ output01 += ret;
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4.2_time: %f\n", total);
+
+ // The two accumulated sums must be equal.
+ CPPUNIT_ASSERT_EQUAL(output0, output01);
+}
+
+#endif
diff --git a/volk/lib/qa_32u_popcnt_aligned16.h b/volk/lib/qa_32u_popcnt_aligned16.h
new file mode 100644
index 000000000..fa1dc1041
--- /dev/null
+++ b/volk/lib/qa_32u_popcnt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32U_POPCNT_ALIGNED16_H
+#define INCLUDED_QA_32U_POPCNT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32u_popcnt_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_32u_popcnt_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_32u_popcnt_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32U_POPCNT_ALIGNED16_H */
diff --git a/volk/lib/qa_64f_convert_32f_aligned16.cc b/volk/lib/qa_64f_convert_32f_aligned16.cc
new file mode 100644
index 000000000..0eaebf00a
--- /dev/null
+++ b/volk/lib/qa_64f_convert_32f_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_64f_convert_32f_aligned16.h>
+#include <volk/volk_64f_convert_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_64f_convert_32f_aligned16::t1() { // variant compiled when LV_HAVE_SSE2 is not defined
+ printf("sse2 not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+void qa_64f_convert_32f_aligned16::t1() {
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ // Times the "generic" and "sse2" variants on one shared input and requires identical outputs.
+ double input0[vlen] __attribute__ ((aligned (16)));
+ // One output buffer per variant.
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_sse2[vlen] __attribute__ ((aligned (16)));
+ // Inputs are rand() centred on zero and divided by RAND_MAX/2, i.e. roughly [-1, 1].
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2));
+ }
+ printf("64f_convert_32f_aligned\n");
+ // Time ITERS calls of the generic variant.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_64f_convert_32f_aligned16_manual(output_generic, input0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ // Same measurement for the sse2 variant.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_64f_convert_32f_aligned16_manual(output_sse2, input0, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) { // placeholder loop: body is only commented-out debug output
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // The float outputs are compared for exact equality, not within a tolerance.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_64f_convert_32f_aligned16.h b/volk/lib/qa_64f_convert_32f_aligned16.h
new file mode 100644
index 000000000..95d79f73d
--- /dev/null
+++ b/volk/lib/qa_64f_convert_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H
+#define INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64f_convert_32f_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_64f_convert_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_64f_convert_32f_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.cc b/volk/lib/qa_64f_convert_32f_unaligned16.cc
new file mode 100644
index 000000000..dcf94bd27
--- /dev/null
+++ b/volk/lib/qa_64f_convert_32f_unaligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_64f_convert_32f_unaligned16.h>
+#include <volk/volk_64f_convert_32f_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_64f_convert_32f_unaligned16::t1() { // variant compiled when LV_HAVE_SSE2 is not defined
+ printf("sse2 not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+void qa_64f_convert_32f_unaligned16::t1() {
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ // Times the "generic" and "sse2" variants on one shared input and requires identical outputs.
+ double input0[vlen] __attribute__ ((aligned (16)));
+ // One output buffer per variant; note the buffers here are still declared 16-byte aligned.
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_sse2[vlen] __attribute__ ((aligned (16)));
+ // Inputs are rand() centred on zero and divided by RAND_MAX/2, i.e. roughly [-1, 1].
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2));
+ }
+ printf("64f_convert_32f_unaligned\n");
+ // Time ITERS calls of the generic variant.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_64f_convert_32f_unaligned16_manual(output_generic, input0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ // Same measurement for the sse2 variant.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_64f_convert_32f_unaligned16_manual(output_sse2, input0, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+
+ for(int i = 0; i < 1; ++i) { // placeholder loop: body is only commented-out debug output
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // The float outputs are compared for exact equality, not within a tolerance.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.h b/volk/lib/qa_64f_convert_32f_unaligned16.h
new file mode 100644
index 000000000..430327e81
--- /dev/null
+++ b/volk/lib/qa_64f_convert_32f_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H
+#define INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64f_convert_32f_unaligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_64f_convert_32f_unaligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_64f_convert_32f_unaligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H */
diff --git a/volk/lib/qa_64f_max_aligned16.cc b/volk/lib/qa_64f_max_aligned16.cc
new file mode 100644
index 000000000..41ab078b0
--- /dev/null
+++ b/volk/lib/qa_64f_max_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_64f_max_aligned16.h>
+#include <volk/volk_64f_max_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_64f_max_aligned16::t1() { // variant compiled when LV_HAVE_SSE2 is not defined
+ printf("sse2 not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+void qa_64f_max_aligned16::t1() {
+ // Times the "generic" and "sse2" variants on shared inputs and requires identical outputs.
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ double input0[vlen] __attribute__ ((aligned (16)));
+ double input1[vlen] __attribute__ ((aligned (16)));
+ // output0 receives the generic result, output01 the sse2 result.
+ double output0[vlen] __attribute__ ((aligned (16)));
+ double output01[vlen] __attribute__ ((aligned (16)));
+ // Inputs are rand() centred on zero and divided by RAND_MAX/2, i.e. roughly [-1, 1].
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2));
+ input1[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2));
+ }
+ printf("64f_max_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_64f_max_aligned16_manual(output0, input0, input1, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_64f_max_aligned16_manual(output01, input0, input1, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total); // label now names the "sse2" variant that was timed
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // Every element of the two outputs must match exactly.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_64f_max_aligned16.h b/volk/lib/qa_64f_max_aligned16.h
new file mode 100644
index 000000000..7cbd4d4c1
--- /dev/null
+++ b/volk/lib/qa_64f_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64F_MAX_ALIGNED16_H
+#define INCLUDED_QA_64F_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64f_max_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_64f_max_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_64f_max_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_64F_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_64f_min_aligned16.cc b/volk/lib/qa_64f_min_aligned16.cc
new file mode 100644
index 000000000..b4664d065
--- /dev/null
+++ b/volk/lib/qa_64f_min_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_64f_min_aligned16.h>
+#include <volk/volk_64f_min_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_64f_min_aligned16::t1() { // variant compiled when LV_HAVE_SSE2 is not defined
+ printf("sse2 not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+void qa_64f_min_aligned16::t1() {
+ // Times the "generic" and "sse2" variants on shared inputs and requires identical outputs.
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ double input0[vlen] __attribute__ ((aligned (16)));
+ double input1[vlen] __attribute__ ((aligned (16)));
+ // output0 receives the generic result, output01 the sse2 result.
+ double output0[vlen] __attribute__ ((aligned (16)));
+ double output01[vlen] __attribute__ ((aligned (16)));
+ // Inputs are rand() centred on zero and divided by RAND_MAX/2, i.e. roughly [-1, 1].
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2));
+ input1[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2));
+ }
+ printf("64f_min_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_64f_min_aligned16_manual(output0, input0, input1, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_64f_min_aligned16_manual(output01, input0, input1, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total); // label now names the "sse2" variant that was timed
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // Every element of the two outputs must match exactly.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_64f_min_aligned16.h b/volk/lib/qa_64f_min_aligned16.h
new file mode 100644
index 000000000..a0e95395f
--- /dev/null
+++ b/volk/lib/qa_64f_min_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64F_MIN_ALIGNED16_H
+#define INCLUDED_QA_64F_MIN_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64f_min_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_64f_min_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_64f_min_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_64F_MIN_ALIGNED16_H */
diff --git a/volk/lib/qa_64u_byteswap_aligned16.cc b/volk/lib/qa_64u_byteswap_aligned16.cc
new file mode 100644
index 000000000..4f5d4d02b
--- /dev/null
+++ b/volk/lib/qa_64u_byteswap_aligned16.cc
@@ -0,0 +1,59 @@
+#include <volk/volk.h>
+#include <qa_64u_byteswap_aligned16.h>
+#include <volk/volk_64u_byteswap_aligned16.h>
+#include <cstdlib>
+#include <cstring>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_64u_byteswap_aligned16::t1() { // variant compiled when LV_HAVE_SSE2 is not defined
+ printf("sse2 not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+void qa_64u_byteswap_aligned16::t1() {
+ // Byte-swaps two copies of the same data with the "generic" and "sse2" variants and compares them.
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100001;
+ // ITERS is odd: the buffers are swapped in place, so an even count would undo the swap.
+ uint64_t output0[vlen] __attribute__ ((aligned (16)));
+ uint64_t output01[vlen] __attribute__ ((aligned (16)));
+ // Use raw rand() bits in both halves; the old integer division by RAND_MAX/2 left almost every word 0.
+ for(int i = 0; i < vlen; ++i) {
+ output0[i] = ((uint64_t) rand() << 32) ^ (uint64_t) rand();
+ }
+ memcpy(output01, output0, vlen*sizeof(uint64_t));
+ printf("64u_byteswap_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_64u_byteswap_aligned16_manual(output0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_64u_byteswap_aligned16_manual(output01, vlen, "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+ // Both buffers started identical, so they must still match word for word.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_64u_byteswap_aligned16.h b/volk/lib/qa_64u_byteswap_aligned16.h
new file mode 100644
index 000000000..a4fa0c983
--- /dev/null
+++ b/volk/lib/qa_64u_byteswap_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H
+#define INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64u_byteswap_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_64u_byteswap_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 (); // defined in qa_64u_byteswap_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H */
diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc
new file mode 100644
index 000000000..bce9ff6c2
--- /dev/null
+++ b/volk/lib/qa_64u_popcnt_aligned16.cc
@@ -0,0 +1,61 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_64u_popcnt_aligned16.h>
+#include <volk/volk_64u_popcnt_aligned16.h>
+#include <cstdlib>
+
+//test for sse4.2
+
+#ifndef LV_HAVE_SSE4_2
+
+void qa_64u_popcnt_aligned16::t1() { // variant compiled when LV_HAVE_SSE4_2 is not defined
+ printf("sse4.2 not available... no test performed\n"); // only reports the skip; nothing is asserted
+}
+
+#else
+
+void qa_64u_popcnt_aligned16::t1() {
+ // Compares the "generic" popcnt variant with whatever the runtime dispatch table selects,
+ // using one random word and the accumulated result of ITERS calls on each path.
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+
+ const int ITERS = 10000000;
+ uint64_t input0 __attribute__ ((aligned (16)));
+ // output0 accumulates the generic results, output01 the runtime-dispatched results.
+ uint64_t output0 __attribute__ ((aligned (16)));
+ uint64_t output01 __attribute__ ((aligned (16)));
+
+ input0 = ((uint64_t) (rand() - (RAND_MAX/2)));
+ output0 = 0;
+ output01 = 0;
+
+ printf("64u_popcnt_aligned\n");
+
+ start = clock();
+ uint64_t ret = 0;
+ for(int count = 0; count < ITERS; ++count) {
+ volk_64u_popcnt_aligned16_manual(&ret, input0, "generic");
+ output0 += ret;
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ ret = 0;
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_64u_popcnt_aligned16(&ret, input0); // NOTE(review): presumably the SSE4.2 kernel on capable CPUs - confirm
+ output01 += ret;
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4.2_time: %f\n", total);
+
+ // The two accumulated sums must be equal.
+ CPPUNIT_ASSERT_EQUAL(output0, output01);
+}
+
+#endif
diff --git a/volk/lib/qa_64u_popcnt_aligned16.h b/volk/lib/qa_64u_popcnt_aligned16.h
new file mode 100644
index 000000000..217822d6e
--- /dev/null
+++ b/volk/lib/qa_64u_popcnt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64U_POPCNT_ALIGNED16_H
+#define INCLUDED_QA_64U_POPCNT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64u_popcnt_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite declared below registers t1() as its only test.
+ CPPUNIT_TEST_SUITE (qa_64u_popcnt_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_64U_POPCNT_ALIGNED16_H */
diff --git a/volk/lib/qa_8s_convert_16s_aligned16.cc b/volk/lib/qa_8s_convert_16s_aligned16.cc
new file mode 100644
index 000000000..35f08fb81
--- /dev/null
+++ b/volk/lib/qa_8s_convert_16s_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8s_convert_16s_aligned16.h>
+#include <volk/volk_8s_convert_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse4_1
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8s_convert_16s_aligned16::t1() { // variant compiled when LV_HAVE_SSE4_1 is undefined: prints a notice, asserts nothing
+ printf("sse4.1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8s_convert_16s_aligned16::t1() {
+ // Runs the "generic" kernel and the one reached through get_volk_runtime() on the same random int8_t input, times both, and requires identical int16_t output.
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ int8_t input0[vlen] __attribute__ ((aligned (16)));
+
+ int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse4_1[vlen] __attribute__ ((aligned (16)));
+ // Each sample: rand() centred on zero, divided by RAND_MAX/2, multiplied by 128.0, cast to int8_t.
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0));
+ }
+ printf("8s_convert_16s_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8s_convert_16s_aligned16_manual(output_generic, input0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_8s_convert_16s_aligned16(output_sse4_1, input0, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4_1_time: %f\n", total);
+ // NOTE(review): the next loop contains no statements, only commented-out debug prints.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_8s_convert_16s_aligned16.h b/volk/lib/qa_8s_convert_16s_aligned16.h
new file mode 100644
index 000000000..38739fc96
--- /dev/null
+++ b/volk/lib/qa_8s_convert_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H
+#define INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8s_convert_16s_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite declared below registers t1() as its only test.
+ CPPUNIT_TEST_SUITE (qa_8s_convert_16s_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.cc b/volk/lib/qa_8s_convert_16s_unaligned16.cc
new file mode 100644
index 000000000..bb326f818
--- /dev/null
+++ b/volk/lib/qa_8s_convert_16s_unaligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8s_convert_16s_unaligned16.h>
+#include <volk/volk_8s_convert_16s_unaligned16.h>
+#include <cstdlib>
+
+//test for sse4_1
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8s_convert_16s_unaligned16::t1() { // variant compiled when LV_HAVE_SSE4_1 is undefined: prints a notice, asserts nothing
+ printf("sse4.1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8s_convert_16s_unaligned16::t1() {
+ // Compares the "generic" kernel with the one reached through get_volk_runtime() on a source pointer that is not 16-byte aligned.
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ int8_t input0[vlen+1] __attribute__ ((aligned (16)));
+
+ int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse4_1[vlen] __attribute__ ((aligned (16)));
+ // One extra element is allocated and filled so that &input0[1] still has vlen initialized bytes behind it.
+ for(int i = 0; i < vlen+1; ++i) {
+ input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0));
+ }
+ printf("8s_convert_16s_unaligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8s_convert_16s_unaligned16_manual(output_generic, &input0[1], vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_8s_convert_16s_unaligned16(output_sse4_1, &input0[1], vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4_1_time: %f\n", total);
+
+ // The source is &input0[1], one byte past the 16-byte aligned start of the array, so
+ // it is deliberately misaligned (the same approach as the 8s_convert_32f_unaligned16
+ // test). Previously the aligned array itself was passed, which made this test a
+ // duplicate of the aligned one. Outputs must match element for element.
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output_generic[i], output_sse4_1[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.h b/volk/lib/qa_8s_convert_16s_unaligned16.h
new file mode 100644
index 000000000..d39fffc35
--- /dev/null
+++ b/volk/lib/qa_8s_convert_16s_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H
+#define INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8s_convert_16s_unaligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite declared below registers t1() as its only test.
+ CPPUNIT_TEST_SUITE (qa_8s_convert_16s_unaligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H */
diff --git a/volk/lib/qa_8s_convert_32f_aligned16.cc b/volk/lib/qa_8s_convert_32f_aligned16.cc
new file mode 100644
index 000000000..522da0b9d
--- /dev/null
+++ b/volk/lib/qa_8s_convert_32f_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8s_convert_32f_aligned16.h>
+#include <volk/volk_8s_convert_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse4.1
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8s_convert_32f_aligned16::t1() { // variant compiled when LV_HAVE_SSE4_1 is undefined: prints a notice, asserts nothing
+ printf("sse4_1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8s_convert_32f_aligned16::t1() {
+ // Runs the "generic" kernel and the one reached through get_volk_runtime() on the same random int8_t input, times both, and requires identical float output.
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ int8_t input0[vlen] __attribute__ ((aligned (16)));
+
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_sse4_1[vlen] __attribute__ ((aligned (16)));
+ // Each sample: rand() centred on zero, divided by RAND_MAX/2, multiplied by 128.0, cast to int8_t.
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0));
+ }
+ printf("8s_convert_32f_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8s_convert_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_8s_convert_32f_aligned16(output_sse4_1, input0, 128.0, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4_1_time: %f\n", total);
+ // NOTE(review): the next loop contains no statements, only commented-out debug prints.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_8s_convert_32f_aligned16.h b/volk/lib/qa_8s_convert_32f_aligned16.h
new file mode 100644
index 000000000..7f8401d42
--- /dev/null
+++ b/volk/lib/qa_8s_convert_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H
+#define INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8s_convert_32f_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite declared below registers t1() as its only test.
+ CPPUNIT_TEST_SUITE (qa_8s_convert_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.cc b/volk/lib/qa_8s_convert_32f_unaligned16.cc
new file mode 100644
index 000000000..ea1fb7c74
--- /dev/null
+++ b/volk/lib/qa_8s_convert_32f_unaligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8s_convert_32f_unaligned16.h>
+#include <volk/volk_8s_convert_32f_unaligned16.h>
+#include <cstdlib>
+
+//test for sse4.1
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8s_convert_32f_unaligned16::t1() { // variant compiled when LV_HAVE_SSE4_1 is undefined: prints a notice, asserts nothing
+ printf("sse4_1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8s_convert_32f_unaligned16::t1() {
+ // Compares the "generic" kernel with the one reached through get_volk_runtime() on a source pointer that is not 16-byte aligned.
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ int8_t input0[vlen+1] __attribute__ ((aligned (16)));
+
+ float output_generic[vlen+1] __attribute__ ((aligned (16)));
+ float output_sse4_1[vlen+1] __attribute__ ((aligned (16)));
+ // Fill all vlen+1 bytes: the kernels read input0[1..vlen], so the last element must be initialized too.
+ for(int i = 0; i < vlen+1; ++i) {
+ input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0));
+ }
+ printf("8s_convert_32f_unaligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8s_convert_32f_unaligned16_manual(output_generic, &input0[1], 128.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_8s_convert_32f_unaligned16(output_sse4_1, &input0[1], 128.0, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4_1_time: %f\n", total);
+
+ // The source is &input0[1], one byte past the 16-byte aligned start of the array, so
+ // it is deliberately misaligned. Both kernels read the same bytes and receive the
+ // same 128.0 argument; the outputs are compared element by element with exact
+ // float equality.
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%e...%e\n", output_generic[i], output_sse4_1[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.h b/volk/lib/qa_8s_convert_32f_unaligned16.h
new file mode 100644
index 000000000..aad2f8c22
--- /dev/null
+++ b/volk/lib/qa_8s_convert_32f_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H
+#define INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8s_convert_32f_unaligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite declared below registers t1() as its only test.
+ CPPUNIT_TEST_SUITE (qa_8s_convert_32f_unaligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H */
diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc
new file mode 100644
index 000000000..823e7fe2e
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc
@@ -0,0 +1,67 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8sc_deinterleave_16s_aligned16.h>
+#include <volk/volk_8sc_deinterleave_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse4.1
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8sc_deinterleave_16s_aligned16::t1() { // variant compiled when LV_HAVE_SSE4_1 is undefined: prints a notice, asserts nothing
+ printf("sse4_1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_deinterleave_16s_aligned16::t1() {
+ // Runs the "generic" kernel and the one reached through get_volk_runtime() on the same random
+ // complex<int8_t> input, times both, and requires both int16_t output arrays to match exactly.
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int16_t output_generic1[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse4_1[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse4_11[vlen] __attribute__ ((aligned (16)));
+ // The complex array is filled through an int8_t view, 2*vlen bytes in total.
+ int8_t* loadInput = (int8_t*)input0;
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+ }
+ printf("8sc_deinterleave_16s_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_8sc_deinterleave_16s_aligned16(output_sse4_1, output_sse4_11, input0, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4.1_time: %f\n", total);
+ // NOTE(review): the next loop contains no statements, only commented-out debug prints.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_sse4_11[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.h b/volk/lib/qa_8sc_deinterleave_16s_aligned16.h
new file mode 100644
index 000000000..9c99fed70
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H
+#define INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_deinterleave_16s_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite declared below registers t1() as its only test.
+ CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_16s_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc
new file mode 100644
index 000000000..fb580516c
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc
@@ -0,0 +1,134 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8sc_deinterleave_32f_aligned16.h>
+#include <volk/volk_8sc_deinterleave_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE4_1
+
+#ifndef LV_HAVE_SSE
+
+void qa_8sc_deinterleave_32f_aligned16::t1() { // variant compiled when neither LV_HAVE_SSE4_1 nor LV_HAVE_SSE is defined: prints a notice, asserts nothing
+ printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_deinterleave_32f_aligned16::t1() {
+ // Variant built with LV_HAVE_SSE but without LV_HAVE_SSE4_1: compares the "generic" and "sse" kernels on the same random input.
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_generic1[vlen] __attribute__ ((aligned (16)));
+ float output_sse[vlen] __attribute__ ((aligned (16)));
+ float output_sse1[vlen] __attribute__ ((aligned (16)));
+
+ int8_t* loadInput = (int8_t*)input0;
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+ }
+ printf("8sc_deinterleave_32f_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 128.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, 128.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ // Each output array is checked with a relative tolerance of 1e-4 derived from the
+ // expected value of that same array. The second assertion previously took its
+ // tolerance from output_generic instead of output_generic1, so the allowed error
+ // had nothing to do with the value being compared.
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%e...%e\n", output_generic[i], output_sse[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], fabs(output_generic1[i])*1e-4);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+#else
+
+void qa_8sc_deinterleave_32f_aligned16::t1() {
+ // Variant built with LV_HAVE_SSE4_1: compares the "generic" kernel against the "sse" kernel
+ // and against the one reached through get_volk_runtime(), all on the same random input.
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_generic1[vlen] __attribute__ ((aligned (16)));
+ float output_sse[vlen] __attribute__ ((aligned (16)));
+ float output_sse1[vlen] __attribute__ ((aligned (16)));
+ float output_sse4_1[vlen] __attribute__ ((aligned (16)));
+ float output_sse14_1[vlen] __attribute__ ((aligned (16)));
+
+ int8_t* loadInput = (int8_t*)input0;
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+ }
+ printf("8sc_deinterleave_32f_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 128.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, 128.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_8sc_deinterleave_32f_aligned16(output_sse4_1, output_sse14_1, input0, 128.0, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4.1_time: %f\n", total);
+
+ // Tolerance is max(|expected| * 1e-4, 1e-4), computed from the array actually being
+ // compared. Previously the second-output assertions used output_generic instead of
+ // output_generic1, and the magnitude was taken without fabs(), so a negative
+ // expected value silently fell back to the absolute floor.
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d generic... %e %e, sse... %e %e sse4.1... %e %e\n", i, output_generic[i], output_generic1[i], output_sse[i], output_sse1[i], output_sse4_1[i], output_sse14_1[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], std::max<double>(fabs(output_generic[i])*1e-4, 1e-4));
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], std::max<double>(fabs(output_generic1[i])*1e-4, 1e-4));
+
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], std::max<double>(fabs(output_generic[i])*1e-4, 1e-4));
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse14_1[i], std::max<double>(fabs(output_generic1[i])*1e-4, 1e-4));
+ }
+}
+
+
+#endif /* LV_HAVE_SSE4_1 */
diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.h b/volk/lib/qa_8sc_deinterleave_32f_aligned16.h
new file mode 100644
index 000000000..63b5fdadb
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H
+#define INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_deinterleave_32f_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite declared below registers t1() as its only test.
+ CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc
new file mode 100644
index 000000000..1cc844b52
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc
@@ -0,0 +1,64 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8sc_deinterleave_real_16s_aligned16.h>
+#include <volk/volk_8sc_deinterleave_real_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse4.1
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8sc_deinterleave_real_16s_aligned16::t1() { // variant compiled when LV_HAVE_SSE4_1 is undefined: prints a notice, asserts nothing
+ printf("sse4_1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_deinterleave_real_16s_aligned16::t1() {
+ // Runs the "generic" kernel and the one reached through get_volk_runtime() on the same random
+ // complex<int8_t> input, times both, and requires the int16_t outputs to match exactly.
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int16_t output_sse4_1[vlen] __attribute__ ((aligned (16)));
+ // The complex array is filled through an int8_t view, 2*vlen bytes in total.
+ int8_t* loadInput = (int8_t*)input0;
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+ }
+ printf("8sc_deinterleave_real_16s_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_deinterleave_real_16s_aligned16_manual(output_generic, input0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_8sc_deinterleave_real_16s_aligned16(output_sse4_1, input0, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4.1_time: %f\n", total);
+ // NOTE(review): the next loop contains no statements, only commented-out debug prints.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h
new file mode 100644
index 000000000..02050926f
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H
+#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite declared below registers t1() as its only test.
+ CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_16s_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc
new file mode 100644
index 000000000..10e537cde
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc
@@ -0,0 +1,138 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8sc_deinterleave_real_32f_aligned16.h>
+#include <volk/volk_8sc_deinterleave_real_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE4_1
+
+#ifndef LV_HAVE_SSE
+
+void qa_8sc_deinterleave_real_32f_aligned16::t1() { // variant compiled when neither LV_HAVE_SSE4_1 nor LV_HAVE_SSE is defined: prints a notice, asserts nothing
+ printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_deinterleave_real_32f_aligned16::t1() {
+ // Variant built with LV_HAVE_SSE but without LV_HAVE_SSE4_1: compares the "generic" and
+ // "sse" kernels on the same random input. (The unused local 'ret' has been removed.)
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ float output_generic[vlen] __attribute__ ((aligned (16)));
+ float output_sse[vlen] __attribute__ ((aligned (16)));
+
+ int8_t* loadInput = (int8_t*)input0;
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+ }
+ printf("8sc_deinterleave_real_32f_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ // Both kernels receive the same input and the same 32768.0 argument. Each output
+ // element is compared with a relative tolerance of 1e-4 of the generic result,
+ // which means an expected value of exactly zero has to be matched exactly.
+ // NOTE(review): the SSE4.1 variant of this test passes 128.0 instead - confirm.
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%e...%e\n", output_generic[i], output_sse[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+#else
+
+void qa_8sc_deinterleave_real_32f_aligned16::t1() {
+ // Variant built with LV_HAVE_SSE4_1: compares the "generic" kernel against the "sse" kernel
+ // and against the one reached through get_volk_runtime(), using heap buffers aligned to 16 bytes.
+ volk_runtime_init();
+
+ volk_environment_init();
+ int ret;
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ std::complex<int8_t> *input0;
+
+ float* output_generic;
+ float* output_sse;
+ float* output_sse4_1;
+
+ ret = posix_memalign((void**)&input0, 16, 2*vlen * sizeof(int8_t));
+ CPPUNIT_ASSERT_EQUAL(0, ret);
+ ret = posix_memalign((void**)&output_generic, 16, vlen * sizeof(float));
+ CPPUNIT_ASSERT_EQUAL(0, ret);
+ ret = posix_memalign((void**)&output_sse, 16, vlen * sizeof(float));
+ CPPUNIT_ASSERT_EQUAL(0, ret);
+ ret = posix_memalign((void**)&output_sse4_1, 16, vlen * sizeof(float));
+ CPPUNIT_ASSERT_EQUAL(0, ret);
+
+ int8_t* loadInput = (int8_t*)input0;
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+ }
+
+ printf("8sc_deinterleave_real_32f_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 128.0, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_8sc_deinterleave_real_32f_aligned16(output_sse4_1, input0, 128.0, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4_1_time: %f\n", total);
+
+ // Fixed: the input cast used to truncate the (-1,1) ratio to 0 before scaling, giving an almost all-zero
+ // input that hid the 1288.0 typo formerly passed to the "sse" kernel. All three calls now pass 128.0.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%e...%e...%e\n", output_generic[i], output_sse[i], output_sse4_1[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4);
+ }
+
+ free(input0);
+ free(output_generic);
+ free(output_sse);
+ free(output_sse4_1);
+}
+
+#endif /* LV_HAVE_SSE4_1 */
diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h
new file mode 100644
index 000000000..93338e488
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite declared below registers t1() as its only test.
+ CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc
new file mode 100644
index 000000000..d84df8119
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_8sc_deinterleave_real_8s_aligned16.h>
+#include <volk/volk_8sc_deinterleave_real_8s_aligned16.h>
+#include <cstdlib>
+
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_8sc_deinterleave_real_8s_aligned16::t1() { // variant compiled when LV_HAVE_SSSE3 is undefined: prints a notice, asserts nothing
+ printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_deinterleave_real_8s_aligned16::t1() {
+ // Runs the "generic" and "ssse3" variants of the kernel on the same random complex<int8_t> input, times both, and requires identical int8_t output.
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 100000;
+ std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16)));
+
+ int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+ int8_t output_ssse3[vlen] __attribute__ ((aligned (16)));
+ // The complex array is filled through an int8_t view, 2*vlen bytes in total.
+ int8_t* loadInput = (int8_t*)input0;
+ for(int i = 0; i < vlen*2; ++i) {
+ loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+ }
+ printf("8sc_deinterleave_real_8s_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_deinterleave_real_8s_aligned16_manual(output_generic, input0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("ssse3_time: %f\n", total);
+ // NOTE(review): the next loop contains no statements, only commented-out debug prints.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h
new file mode 100644
index 000000000..92fc0dd4a
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H
+#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_deinterleave_real_8s_aligned16 : public CppUnit::TestCase {
+ // CppUnit fixture; the suite declared below registers t1() as its only test.
+ CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_8s_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H */
diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc
new file mode 100644
index 000000000..d64eac8ce
--- /dev/null
+++ b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc
@@ -0,0 +1,87 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8sc_multiply_conjugate_16sc_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define assertcomplexEqual(expected, actual, delta) do { /* one statement, so it is safe under an unbraced if/else; tolerance is |expected part| * delta */ \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * (delta)); \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected)) * (delta)); } while (0)
+
+#define ERR_DELTA (1e-4)
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8sc_multiply_conjugate_16sc_aligned16::t1() { // variant compiled when LV_HAVE_SSE4_1 is undefined: prints a notice, asserts nothing
+ printf("sse4.1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_multiply_conjugate_16sc_aligned16::t1() {
+ // Runs the "generic" kernel and the one reached through get_volk_runtime() on the same random
+ // complex<int8_t> input and taps, times both, and compares the complex<int16_t> results.
+ volk_runtime_init();
+
+ const int vlen = 2046;
+ const int ITERS = 100000;
+
+ volk_environment_init();
+ int ret;
+ clock_t start, end;
+ double total;
+ std::complex<int8_t>* input;
+ std::complex<int8_t>* taps;
+
+ std::complex<int16_t>* result_generic;
+ std::complex<int16_t>* result_sse4_1;
+ int8_t* inputInt8_T;
+ int8_t* tapsInt8_T;
+
+ ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(int8_t));
+ CPPUNIT_ASSERT_EQUAL(0, ret);
+ ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(int8_t));
+ CPPUNIT_ASSERT_EQUAL(0, ret);
+ ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(int16_t));
+ CPPUNIT_ASSERT_EQUAL(0, ret);
+ ret = posix_memalign((void**)&result_sse4_1, 16, vlen*2*sizeof(int16_t));
+ CPPUNIT_ASSERT_EQUAL(0, ret);
+
+ inputInt8_T = (int8_t*)input;
+ tapsInt8_T = (int8_t*)taps;
+ for(int i = 0; i < vlen*2; ++i) {
+ inputInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+ tapsInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+ }
+
+ printf("8sc_multiply_conjugate_16sc_aligned16\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_multiply_conjugate_16sc_aligned16_manual((std::complex<int16_t>*)result_generic, (std::complex<int8_t>*)input, (std::complex<int8_t>*)taps, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_8sc_multiply_conjugate_16sc_aligned16((std::complex<int16_t>*)result_sse4_1, (std::complex<int8_t>*)input, (std::complex<int8_t>*)taps, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4_1_time: %f\n", total);
+
+ for(int i = 0; i < vlen; i++){
+ //printf("%d %d+%di %d+%di -> %d+%di %d+%di\n", i, std::real(input[i]), std::imag(input[i]), std::real(taps[i]), std::imag(taps[i]), std::real(result_generic[i]), std::imag(result_generic[i]), std::real(result_sse4_1[i]), std::imag(result_sse4_1[i]));
+ assertcomplexEqual(result_generic[i], result_sse4_1[i], ERR_DELTA);
+ }
+
+ free(input);
+ free(taps);
+ free(result_generic);
+ free(result_sse4_1);
+}
+
+#endif /*LV_HAVE_SSE4_1*/
diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h
new file mode 100644
index 000000000..0e78a5eca
--- /dev/null
+++ b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H
+#define INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_multiply_conjugate_16sc_aligned16 : public CppUnit::TestCase {
+ // CppUnit test case class; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_8sc_multiply_conjugate_16sc_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is only declared here; its definition is not part of this header.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H */
diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc
new file mode 100644
index 000000000..c27f0e0ca
--- /dev/null
+++ b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc
@@ -0,0 +1,87 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8sc_multiply_conjugate_32fc_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+// Compares real and imaginary parts separately, each with tolerance |expected part| * delta; expands to two statements.
+#define assertcomplexEqual(expected, actual, delta) \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);
+#define ERR_DELTA (1e-4) // relative tolerance handed to assertcomplexEqual as delta
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8sc_multiply_conjugate_32fc_aligned16::t1() {
+ fputs("sse4.1 not available... no test performed\n", stdout); // placeholder used when LV_HAVE_SSE4_1 is not defined
+}
+
+#else
+
+void qa_8sc_multiply_conjugate_32fc_aligned16::t1() {
+ // Times the "generic" kernel and the one reached through get_volk_runtime() on the same random input,
+ // then checks that both outputs agree element by element within ERR_DELTA.
+ volk_runtime_init();
+
+ const int vlen = 2046;
+ const int ITERS = 100000;
+
+ volk_environment_init();
+ int ret;
+ clock_t start, end;
+ double total;
+ std::complex<int8_t>* input;
+ std::complex<int8_t>* taps;
+
+ std::complex<float>* result_generic;
+ std::complex<float>* result_sse4_1;
+ int i;
+ int8_t* inputInt8_T;
+ int8_t* tapsInt8_T;
+ // 16-byte-aligned buffers holding vlen complex values each (two scalars per value).
+ ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(int8_t));
+ ret |= posix_memalign((void**)&taps, 16, vlen*2*sizeof(int8_t));
+ ret |= posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float));
+ ret |= posix_memalign((void**)&result_sse4_1, 16, vlen*2*sizeof(float));
+ // posix_memalign returns non-zero on failure; stop here instead of handing invalid pointers to the kernels.
+ CPPUNIT_ASSERT_MESSAGE("posix_memalign failed", ret == 0);
+ inputInt8_T = (int8_t*)input;
+ tapsInt8_T = (int8_t*)taps;
+ for(i = 0; i < vlen*2; ++i) {
+ inputInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+ tapsInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+ }
+
+ printf("8sc_multiply_conjugate_32fc_aligned16\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_8sc_multiply_conjugate_32fc_aligned16_manual(result_generic, (const std::complex<int8_t>*)input, (const std::complex<int8_t>*)taps, 32768.0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+
+ // Same measurement for the implementation reached through get_volk_runtime().
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_8sc_multiply_conjugate_32fc_aligned16(result_sse4_1, (const std::complex<int8_t>*)input, (const std::complex<int8_t>*)taps, 32768.0, vlen);
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4_1_time: %f\n", total);
+ // Compare the two outputs; assertcomplexEqual checks real and imaginary parts separately.
+ for(i = 0; i < vlen; i++){
+ //printf("%d %d+%di %d+%di -> %e+%ei %e+%ei\n", i, std::real(input[i]), std::imag(input[i]), std::real(taps[i]), std::imag(taps[i]), std::real(result_generic[i]), std::imag(result_generic[i]), std::real(result_sse4_1[i]), std::imag(result_sse4_1[i]));
+ assertcomplexEqual(result_generic[i], result_sse4_1[i], ERR_DELTA);
+ }
+
+ free(input);
+ free(taps);
+ free(result_generic);
+ free(result_sse4_1);
+
+}
+
+#endif /*LV_HAVE_SSE4_1*/
diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h
new file mode 100644
index 000000000..eb9ae309c
--- /dev/null
+++ b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H
+#define INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_multiply_conjugate_32fc_aligned16 : public CppUnit::TestCase {
+ // CppUnit test case class; the suite macros below register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_8sc_multiply_conjugate_32fc_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+ // t1 is defined in qa_8sc_multiply_conjugate_32fc_aligned16.cc.
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H */
diff --git a/volk/lib/qa_volk.cc b/volk/lib/qa_volk.cc
new file mode 100644
index 000000000..c3c27b69b
--- /dev/null
+++ b/volk/lib/qa_volk.cc
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * This class gathers together all the test cases for the volk
+ * library into a single test suite. As you create new test cases,
+ * add them here.
+ */
+
+#include <qa_volk.h>
+#include <qa_16s_quad_max_star_aligned16.h>
+#include <qa_32fc_dot_prod_aligned16.h>
+#include <qa_32fc_square_dist_aligned16.h>
+#include <qa_32fc_square_dist_scalar_mult_aligned16.h>
+#include <qa_32f_sum_of_poly_aligned16.h>
+#include <qa_32fc_index_max_aligned16.h>
+#include <qa_32f_index_max_aligned16.h>
+#include <qa_32fc_conjugate_dot_prod_aligned16.h>
+#include <qa_16s_permute_and_scalar_add_aligned16.h>
+#include <qa_16s_branch_4_state_8_aligned16.h>
+#include <qa_16s_max_star_horizontal_aligned16.h>
+#include <qa_16s_max_star_aligned16.h>
+#include <qa_16s_add_quad_aligned16.h>
+#include <qa_32f_add_aligned16.h>
+#include <qa_32f_subtract_aligned16.h>
+#include <qa_32f_max_aligned16.h>
+#include <qa_32f_min_aligned16.h>
+#include <qa_64f_max_aligned16.h>
+#include <qa_64f_min_aligned16.h>
+#include <qa_32s_and_aligned16.h>
+#include <qa_32s_or_aligned16.h>
+#include <qa_32f_dot_prod_aligned16.h>
+#include <qa_32f_dot_prod_unaligned16.h>
+#include <qa_32f_fm_detect_aligned16.h>
+#include <qa_32fc_32f_multiply_aligned16.h>
+#include <qa_32fc_multiply_aligned16.h>
+#include <qa_32f_divide_aligned16.h>
+#include <qa_32f_multiply_aligned16.h>
+#include <qa_32f_sqrt_aligned16.h>
+#include <qa_8sc_multiply_conjugate_16sc_aligned16.h>
+#include <qa_8sc_multiply_conjugate_32fc_aligned16.h>
+#include <qa_32u_popcnt_aligned16.h>
+#include <qa_64u_popcnt_aligned16.h>
+#include <qa_16u_byteswap_aligned16.h>
+#include <qa_32u_byteswap_aligned16.h>
+#include <qa_64u_byteswap_aligned16.h>
+#include <qa_32f_normalize_aligned16.h>
+#include <qa_16sc_deinterleave_16s_aligned16.h>
+#include <qa_16sc_deinterleave_32f_aligned16.h>
+#include <qa_16sc_deinterleave_real_16s_aligned16.h>
+#include <qa_16sc_deinterleave_real_32f_aligned16.h>
+#include <qa_16sc_deinterleave_real_8s_aligned16.h>
+#include <qa_16sc_magnitude_16s_aligned16.h>
+#include <qa_16sc_magnitude_32f_aligned16.h>
+#include <qa_32fc_deinterleave_32f_aligned16.h>
+#include <qa_32fc_deinterleave_64f_aligned16.h>
+#include <qa_32fc_deinterleave_real_16s_aligned16.h>
+#include <qa_32fc_deinterleave_real_32f_aligned16.h>
+#include <qa_32fc_deinterleave_real_64f_aligned16.h>
+#include <qa_32fc_magnitude_16s_aligned16.h>
+#include <qa_32fc_magnitude_32f_aligned16.h>
+#include <qa_32f_interleave_16sc_aligned16.h>
+#include <qa_32f_interleave_32fc_aligned16.h>
+#include <qa_8sc_deinterleave_16s_aligned16.h>
+#include <qa_8sc_deinterleave_32f_aligned16.h>
+#include <qa_8sc_deinterleave_real_16s_aligned16.h>
+#include <qa_8sc_deinterleave_real_32f_aligned16.h>
+#include <qa_8sc_deinterleave_real_8s_aligned16.h>
+#include <qa_16s_convert_32f_aligned16.h>
+#include <qa_16s_convert_32f_unaligned16.h>
+#include <qa_16s_convert_8s_aligned16.h>
+#include <qa_16s_convert_8s_unaligned16.h>
+#include <qa_32f_convert_16s_aligned16.h>
+#include <qa_32f_convert_16s_unaligned16.h>
+#include <qa_32f_convert_32s_aligned16.h>
+#include <qa_32f_convert_32s_unaligned16.h>
+#include <qa_32f_convert_64f_aligned16.h>
+#include <qa_32f_convert_64f_unaligned16.h>
+#include <qa_32f_convert_8s_aligned16.h>
+#include <qa_32f_convert_8s_unaligned16.h>
+#include <qa_32s_convert_32f_aligned16.h>
+#include <qa_32s_convert_32f_unaligned16.h>
+#include <qa_64f_convert_32f_aligned16.h>
+#include <qa_64f_convert_32f_unaligned16.h>
+#include <qa_8s_convert_16s_aligned16.h>
+#include <qa_8s_convert_16s_unaligned16.h>
+#include <qa_8s_convert_32f_aligned16.h>
+#include <qa_8s_convert_32f_unaligned16.h>
+#include <qa_32fc_32f_power_32fc_aligned16.h>
+#include <qa_32f_power_aligned16.h>
+#include <qa_32fc_atan2_32f_aligned16.h>
+#include <qa_32fc_power_spectral_density_32f_aligned16.h>
+#include <qa_32fc_power_spectrum_32f_aligned16.h>
+#include <qa_32f_calc_spectral_noise_floor_aligned16.h>
+#include <qa_32f_accumulator_aligned16.h>
+#include <qa_32f_stddev_aligned16.h>
+#include <qa_32f_stddev_and_mean_aligned16.h>
+
+CppUnit::TestSuite *
+qa_volk::suite()
+{
+ CppUnit::TestSuite *volk_tests = new CppUnit::TestSuite("volk");
+ // Add every per-kernel QA suite, in this fixed order, to the top-level "volk" suite.
+ volk_tests->addTest(qa_16s_quad_max_star_aligned16::suite());
+ volk_tests->addTest(qa_32fc_dot_prod_aligned16::suite());
+ volk_tests->addTest(qa_32fc_square_dist_scalar_mult_aligned16::suite());
+ volk_tests->addTest(qa_32fc_square_dist_aligned16::suite());
+ volk_tests->addTest(qa_32f_sum_of_poly_aligned16::suite());
+ volk_tests->addTest(qa_32fc_index_max_aligned16::suite());
+ volk_tests->addTest(qa_32f_index_max_aligned16::suite());
+ volk_tests->addTest(qa_32fc_conjugate_dot_prod_aligned16::suite());
+ volk_tests->addTest(qa_16s_permute_and_scalar_add_aligned16::suite());
+ volk_tests->addTest(qa_16s_branch_4_state_8_aligned16::suite());
+ volk_tests->addTest(qa_16s_max_star_horizontal_aligned16::suite());
+ volk_tests->addTest(qa_16s_max_star_aligned16::suite());
+ volk_tests->addTest(qa_16s_add_quad_aligned16::suite());
+ volk_tests->addTest(qa_32f_add_aligned16::suite());
+ volk_tests->addTest(qa_32f_subtract_aligned16::suite());
+ volk_tests->addTest(qa_32f_max_aligned16::suite());
+ volk_tests->addTest(qa_32f_min_aligned16::suite());
+ volk_tests->addTest(qa_64f_max_aligned16::suite());
+ volk_tests->addTest(qa_64f_min_aligned16::suite());
+ volk_tests->addTest(qa_32s_and_aligned16::suite());
+ volk_tests->addTest(qa_32s_or_aligned16::suite());
+ volk_tests->addTest(qa_32f_dot_prod_aligned16::suite());
+ volk_tests->addTest(qa_32f_dot_prod_unaligned16::suite());
+ volk_tests->addTest(qa_32f_fm_detect_aligned16::suite());
+ volk_tests->addTest(qa_32fc_32f_multiply_aligned16::suite());
+ volk_tests->addTest(qa_32fc_multiply_aligned16::suite());
+ volk_tests->addTest(qa_32f_divide_aligned16::suite());
+ volk_tests->addTest(qa_32f_multiply_aligned16::suite());
+ volk_tests->addTest(qa_32f_sqrt_aligned16::suite());
+ volk_tests->addTest(qa_8sc_multiply_conjugate_16sc_aligned16::suite());
+ volk_tests->addTest(qa_8sc_multiply_conjugate_32fc_aligned16::suite());
+ volk_tests->addTest(qa_32u_popcnt_aligned16::suite());
+ volk_tests->addTest(qa_64u_popcnt_aligned16::suite());
+ volk_tests->addTest(qa_16u_byteswap_aligned16::suite());
+ volk_tests->addTest(qa_32u_byteswap_aligned16::suite());
+ volk_tests->addTest(qa_64u_byteswap_aligned16::suite());
+ volk_tests->addTest(qa_32f_normalize_aligned16::suite());
+ volk_tests->addTest(qa_16sc_deinterleave_16s_aligned16::suite());
+ volk_tests->addTest(qa_16sc_deinterleave_32f_aligned16::suite());
+ volk_tests->addTest(qa_16sc_deinterleave_real_16s_aligned16::suite());
+ volk_tests->addTest(qa_16sc_deinterleave_real_32f_aligned16::suite());
+ volk_tests->addTest(qa_16sc_deinterleave_real_8s_aligned16::suite());
+ volk_tests->addTest(qa_16sc_magnitude_16s_aligned16::suite());
+ volk_tests->addTest(qa_16sc_magnitude_32f_aligned16::suite());
+ volk_tests->addTest(qa_32fc_deinterleave_32f_aligned16::suite());
+ volk_tests->addTest(qa_32fc_deinterleave_64f_aligned16::suite());
+ volk_tests->addTest(qa_32fc_deinterleave_real_16s_aligned16::suite());
+ volk_tests->addTest(qa_32fc_deinterleave_real_32f_aligned16::suite());
+ volk_tests->addTest(qa_32fc_deinterleave_real_64f_aligned16::suite());
+ volk_tests->addTest(qa_32fc_magnitude_16s_aligned16::suite());
+ volk_tests->addTest(qa_32fc_magnitude_32f_aligned16::suite());
+ volk_tests->addTest(qa_32f_interleave_16sc_aligned16::suite());
+ volk_tests->addTest(qa_32f_interleave_32fc_aligned16::suite());
+ volk_tests->addTest(qa_8sc_deinterleave_16s_aligned16::suite());
+ volk_tests->addTest(qa_8sc_deinterleave_32f_aligned16::suite());
+ volk_tests->addTest(qa_8sc_deinterleave_real_16s_aligned16::suite());
+ volk_tests->addTest(qa_8sc_deinterleave_real_32f_aligned16::suite());
+ volk_tests->addTest(qa_8sc_deinterleave_real_8s_aligned16::suite());
+ volk_tests->addTest(qa_16s_convert_32f_aligned16::suite());
+ volk_tests->addTest(qa_16s_convert_32f_unaligned16::suite());
+ volk_tests->addTest(qa_16s_convert_8s_aligned16::suite());
+ volk_tests->addTest(qa_16s_convert_8s_unaligned16::suite());
+ volk_tests->addTest(qa_32f_convert_16s_aligned16::suite());
+ volk_tests->addTest(qa_32f_convert_16s_unaligned16::suite());
+ volk_tests->addTest(qa_32f_convert_32s_aligned16::suite());
+ volk_tests->addTest(qa_32f_convert_32s_unaligned16::suite());
+ volk_tests->addTest(qa_32f_convert_64f_aligned16::suite());
+ volk_tests->addTest(qa_32f_convert_64f_unaligned16::suite());
+ volk_tests->addTest(qa_32f_convert_8s_aligned16::suite());
+ volk_tests->addTest(qa_32f_convert_8s_unaligned16::suite());
+ volk_tests->addTest(qa_32s_convert_32f_aligned16::suite());
+ volk_tests->addTest(qa_32s_convert_32f_unaligned16::suite());
+ volk_tests->addTest(qa_64f_convert_32f_aligned16::suite());
+ volk_tests->addTest(qa_64f_convert_32f_unaligned16::suite());
+ volk_tests->addTest(qa_8s_convert_16s_aligned16::suite());
+ volk_tests->addTest(qa_8s_convert_16s_unaligned16::suite());
+ volk_tests->addTest(qa_8s_convert_32f_aligned16::suite());
+ volk_tests->addTest(qa_8s_convert_32f_unaligned16::suite());
+ volk_tests->addTest(qa_32fc_32f_power_32fc_aligned16::suite());
+ volk_tests->addTest(qa_32f_power_aligned16::suite());
+ volk_tests->addTest(qa_32fc_atan2_32f_aligned16::suite());
+ volk_tests->addTest(qa_32fc_power_spectral_density_32f_aligned16::suite());
+ volk_tests->addTest(qa_32fc_power_spectrum_32f_aligned16::suite());
+ volk_tests->addTest(qa_32f_calc_spectral_noise_floor_aligned16::suite());
+ volk_tests->addTest(qa_32f_accumulator_aligned16::suite());
+ volk_tests->addTest(qa_32f_stddev_aligned16::suite());
+ volk_tests->addTest(qa_32f_stddev_and_mean_aligned16::suite());
+ // The suite was allocated with new above and is handed back as a raw pointer.
+ return volk_tests;
+}
diff --git a/volk/lib/qa_volk.h b/volk/lib/qa_volk.h
new file mode 100644
index 000000000..43fa7faba
--- /dev/null
+++ b/volk/lib/qa_volk.h
@@ -0,0 +1,36 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_QA_VOLK_H
+#define INCLUDED_QA_VOLK_H
+
+#include <cppunit/TestSuite.h>
+
+//! collect all the volk QA test suites into one place
+
+class qa_volk {
+ public:
+ //! return a suite named "volk" that contains the individual qa_* suites
+ static CppUnit::TestSuite *suite ();
+};
+
+#endif /* INCLUDED_QA_VOLK_H */
diff --git a/volk/lib/test_all.cc b/volk/lib/test_all.cc
new file mode 100644
index 000000000..50ac08eab
--- /dev/null
+++ b/volk/lib/test_all.cc
@@ -0,0 +1,82 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2002,2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <cppunit/ui/text/TestRunner.h>
+#include <cppunit/TextTestRunner.h>
+
+#include <qa_volk.h>
+
+#include <cppunit/XmlOutputter.h>
+#include <iostream>
+#include <getopt.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string>
+#include <fstream>
+
+int
+main (int argc, char **argv)
+{
+ // Runs the suite from qa_volk::suite(); with "-o <file>" an XmlOutputter writing to <file> is installed first.
+ int opt = 0;
+ std::string xmlOutputFile("");
+ // "o:" makes getopt require an argument for -o.
+ while( (opt = getopt(argc, argv, "o:")) != -1){
+ switch(opt){
+ case 'o':
+ if(optarg){
+ xmlOutputFile.assign(optarg);
+ }
+ else{
+ std::cerr << "No xml file output specified for -o" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ break;
+
+ default: /* '?' */
+ fprintf(stderr, "Usage: %s [-o] \"xml output file\"\n",
+ argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ CppUnit::TextUi::TestRunner runner;
+ runner.addTest (qa_volk::suite ());
+
+ bool was_successful = false;
+ if(!xmlOutputFile.empty()){
+ std::ofstream xmlOutput(xmlOutputFile.c_str());
+ if(xmlOutput.is_open()){
+ runner.setOutputter(new CppUnit::XmlOutputter(&runner.result(), xmlOutput));
+ was_successful = runner.run("", false, true, false);
+ } else {
+ // No tests are run in this case, so say why the exit status is non-zero.
+ std::cerr << "Unable to open xml output file " << xmlOutputFile << std::endl;
+ }
+ xmlOutput.close();
+ }
+ else{
+ was_successful = runner.run ("", false);
+ }
+
+ return was_successful ? 0 : 1;
+}
diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c
new file mode 100644
index 000000000..b1a93db26
--- /dev/null
+++ b/volk/lib/volk_rank_archs.c
@@ -0,0 +1,13 @@
+#include<volk_rank_archs.h>
+#include<stdio.h>
+
+/* Returns the 0-based index, within arch_defs[1..arch_defs[0]], of the usable entry with the largest mask; arch is treated as the bitmask of available features. */
+unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch) {
+ int i = 2;
+ unsigned int best_val = 0; /* entry 0 (arch_defs[1]) is returned when no later entry qualifies */
+ for(; i < arch_defs[0] + 1; ++i) {
+ /* Usable only if the entry needs no bit missing from arch; this must be bitwise ~, since logical ! yields just 0 or 1. */
+ if((arch_defs[i] & ~arch) == 0 && arch_defs[i] > arch_defs[best_val + 1]) best_val = i - 1;
+ }
+ return best_val;
+}
diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h
new file mode 100644
index 000000000..26b9f7503
--- /dev/null
+++ b/volk/lib/volk_rank_archs.h
@@ -0,0 +1,14 @@
+#ifndef INCLUDED_VOLK_RANK_ARCHS_H
+#define INCLUDED_VOLK_RANK_ARCHS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/