summaryrefslogtreecommitdiff
path: root/volk/lib
diff options
context:
space:
mode:
Diffstat (limited to 'volk/lib')
-rw-r--r--volk/lib/.gitignore23
-rw-r--r--volk/lib/Makefile.am157
-rw-r--r--volk/lib/gcc_x86_cpuid.h178
-rw-r--r--volk/lib/qa_16s_add_quad_aligned16.cc89
-rw-r--r--volk/lib/qa_16s_add_quad_aligned16.h18
-rw-r--r--volk/lib/qa_16s_branch_4_state_8_aligned16.cc106
-rw-r--r--volk/lib/qa_16s_branch_4_state_8_aligned16.h18
-rw-r--r--volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc78
-rw-r--r--volk/lib/qa_16s_permute_and_scalar_add_aligned16.h18
-rw-r--r--volk/lib/qa_16s_quad_max_star_aligned16.cc60
-rw-r--r--volk/lib/qa_16s_quad_max_star_aligned16.h18
-rw-r--r--volk/lib/qa_32f_fm_detect_aligned16.cc61
-rw-r--r--volk/lib/qa_32f_fm_detect_aligned16.h18
-rw-r--r--volk/lib/qa_32f_index_max_aligned16.cc103
-rw-r--r--volk/lib/qa_32f_index_max_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_index_max_aligned16.cc89
-rw-r--r--volk/lib/qa_32fc_index_max_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc64
-rw-r--r--volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc138
-rw-r--r--volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h18
-rw-r--r--volk/lib/qa_32u_popcnt_aligned16.cc62
-rw-r--r--volk/lib/qa_32u_popcnt_aligned16.h18
-rw-r--r--volk/lib/qa_64u_popcnt_aligned16.cc62
-rw-r--r--volk/lib/qa_64u_popcnt_aligned16.h18
-rw-r--r--volk/lib/qa_utils.cc440
-rw-r--r--volk/lib/qa_utils.h33
-rw-r--r--volk/lib/testqa.cc100
-rw-r--r--volk/lib/volk_rank_archs.c13
-rw-r--r--volk/lib/volk_rank_archs.h14
30 files changed, 2068 insertions, 0 deletions
diff --git a/volk/lib/.gitignore b/volk/lib/.gitignore
new file mode 100644
index 000000000..6a5fde28f
--- /dev/null
+++ b/volk/lib/.gitignore
@@ -0,0 +1,23 @@
+/*.cache
+/*.la
+/*.lo
+/*.pc
+/.deps
+/.la
+/.libs
+/.lo
+/Makefile
+/Makefile.in
+/volk.c
+/volk_cpu_generic.c
+/volk_cpu_powerpc.c
+/volk_cpu_x86.c
+/volk_environment_init.c
+/volk_init.c
+/volk_init.h
+/volk_mktables
+/volk_mktables.c
+/volk_proccpu_sim.c
+/volk_runtime.c
+/test_all
+/testqa
diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am
new file mode 100644
index 000000000..3e5502369
--- /dev/null
+++ b/volk/lib/Makefile.am
@@ -0,0 +1,157 @@
+#
+# Copyright 2010,2011 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+include $(top_srcdir)/Makefile.common
+
+#FIXME: forcing the top_builddir for distcheck seems like a bit
+# of a hack. Figure out the right way to do this to find built
+# volk_config.h and volk_tables.h
+
+AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \
+ -I$(top_builddir)/include \
+ $(LV_CXXFLAGS) $(WITH_INCLUDES)
+
+
+# We build 2 libraries and 1 executable here. The libraries contain
+# everything except the QA code. The C++ QA code is especially recommended
+# when you have general purpose C or C++ code that may not get
+# thoroughly exercised by building and running a GR block. The
+# executable runs the QA code at "make check" time.
+#
+#
+#
+# N.B., If there's a SWIG generated shared library and associated
+# python code, it will be contained in ../python, not here. (That
+# code is conditionally built depending on the state of the
+# --without-python configure option.) However, the .i should be here
+# next to the .h that it's based on.
+
+
+# list of programs run by "make check" and "make distcheck"
+#TESTS = testqa
+#orc stuff gets built in the ORC directory conditional to ORC being enabled.
+#it gets linked in during the build of libvolk as an added library.
+#there might be a better way to do this.
+
+lib_LTLIBRARIES = \
+ libvolk.la \
+ libvolk_runtime.la
+
+EXTRA_DIST = \
+ volk_mktables.c \
+ volk_rank_archs.h \
+ volk_proccpu_sim.c \
+ gcc_x86_cpuid.h
+
+# ----------------------------------------------------------------
+# The main library
+# ----------------------------------------------------------------
+
+libvolk_runtime_la_SOURCES = \
+ $(platform_CODE) \
+ volk_runtime.c \
+ volk_init.c \
+ volk_rank_archs.c
+
+libvolk_la_SOURCES = \
+ $(platform_CODE) \
+ volk.c \
+ volk_environment_init.c
+
+volk_orc_LDFLAGS = \
+ $(ORC_LDFLAGS) \
+ -lorc-0.4
+
+volk_orc_LIBADD = \
+ ../orc/libvolk_orc.la
+
+if LV_HAVE_ORC
+libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS)
+libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS)
+libvolk_la_LIBADD = $(volk_orc_LIBADD)
+else
+libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
+libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
+libvolk_la_LIBADD =
+endif
+
+
+# ----------------------------------------------------------------
+# The QA library. Note libvolk.la in LIBADD
+# ----------------------------------------------------------------
+#libvolk_qa_la_SOURCES = \
+# qa_utils.cc
+
+#libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lboost
+
+#libvolk_qa_la_LIBADD = \
+# libvolk.la \
+# libvolk_runtime.la
+
+# ----------------------------------------------------------------
+# headers that don't get installed
+# ----------------------------------------------------------------
+noinst_HEADERS = \
+ volk_init.h \
+ qa_utils.h
+
+# ----------------------------------------------------------------
+# Our test program
+# ----------------------------------------------------------------
+noinst_PROGRAMS = \
+ testqa
+
+testqa_SOURCES = testqa.cc qa_utils.cc
+testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN $(AM_CPPFLAGS)
+testqa_LDFLAGS = $(BOOST_UNIT_TEST_FRAMEWORK_LIB)
+if LV_HAVE_ORC
+testqa_LDADD = \
+ libvolk.la \
+ libvolk_runtime.la \
+ ../orc/libvolk_orc.la
+else
+testqa_LDADD = \
+ libvolk.la \
+ libvolk_runtime.la
+endif
+
+distclean-local:
+ rm -f volk.c
+ rm -f volk_cpu_generic.c
+ rm -f volk_cpu_powerpc.c
+ rm -f volk_cpu_x86.c
+ rm -f volk_init.c
+ rm -f volk_init.h
+ rm -f volk_mktables.c
+ rm -f volk_proccpu_sim.c
+ rm -f volk_runtime.c
+ rm -f volk_tables.h
+ rm -f volk_environment_init.c
+#SUBDIRS =
+
+#ifdef BUILD_SSE
+#SUBDIRS += sse
+#elif BUILD_SPU
+#SUBDIRS += spu
+#else
+#SUBDIRS += port
+#endif
+
+
diff --git a/volk/lib/gcc_x86_cpuid.h b/volk/lib/gcc_x86_cpuid.h
new file mode 100644
index 000000000..2d0916fb3
--- /dev/null
+++ b/volk/lib/gcc_x86_cpuid.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+/* %ecx */
+#define bit_SSE3 (1 << 0)
+#define bit_PCLMUL (1 << 1)
+#define bit_SSSE3 (1 << 9)
+#define bit_FMA (1 << 12)
+#define bit_CMPXCHG16B (1 << 13)
+#define bit_SSE4_1 (1 << 19)
+#define bit_SSE4_2 (1 << 20)
+#define bit_MOVBE (1 << 22)
+#define bit_POPCNT (1 << 23)
+#define bit_AES (1 << 25)
+#define bit_XSAVE (1 << 26)
+#define bit_OSXSAVE (1 << 27)
+#define bit_AVX (1 << 28)
+
+/* %edx */
+#define bit_CMPXCHG8B (1 << 8)
+#define bit_CMOV (1 << 15)
+#define bit_MMX (1 << 23)
+#define bit_FXSAVE (1 << 24)
+#define bit_SSE (1 << 25)
+#define bit_SSE2 (1 << 26)
+
+/* Extended Features */
+/* %ecx */
+#define bit_LAHF_LM (1 << 0)
+#define bit_SSE4a (1 << 6)
+#define bit_SSE5 (1 << 11)
+
+/* %edx */
+#define bit_LM (1 << 29)
+#define bit_3DNOWP (1 << 30)
+#define bit_3DNOW (1 << 31)
+
+
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx may be the PIC register. */
+#if __GNUC__ >= 3
+#define __cpuid(level, a, b, c, d) \
+ __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchg{l}\t{%%}ebx, %1\n\t" \
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d) \
+ __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchg{l}\t{%%}ebx, %1\n\t" \
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "0" (level), "2" (count))
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+ nor alternatives in i386 code. */
+#define __cpuid(level, a, b, c, d) \
+ __asm__ ("xchgl\t%%ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchgl\t%%ebx, %1\n\t" \
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d) \
+ __asm__ ("xchgl\t%%ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchgl\t%%ebx, %1\n\t" \
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "0" (level), "2" (count))
+#endif
+#else
+#define __cpuid(level, a, b, c, d) \
+ __asm__ ("cpuid\n\t" \
+ : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
+ : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d) \
+ __asm__ ("cpuid\n\t" \
+ : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
+ : "0" (level), "2" (count))
+#endif
+
+/* Return highest supported input value for cpuid instruction. ext can
+ be either 0x0 or 0x80000000 to return highest supported value for
+ basic or extended cpuid information. Function returns 0 if cpuid
+ is not supported or whatever cpuid returns in eax register. If sig
+ pointer is non-null, then first four bytes of the signature
+ (as found in ebx register) are returned in location pointed by sig. */
+
+static __inline unsigned int
+__get_cpuid_max (unsigned int __ext, unsigned int *__sig)
+{
+ unsigned int __eax, __ebx, __ecx, __edx;
+
+#ifndef __x86_64__
+#if __GNUC__ >= 3
+ /* See if we can use cpuid. On AMD64 we always can. */
+ __asm__ ("pushf{l|d}\n\t"
+ "pushf{l|d}\n\t"
+ "pop{l}\t%0\n\t"
+ "mov{l}\t{%0, %1|%1, %0}\n\t"
+ "xor{l}\t{%2, %0|%0, %2}\n\t"
+ "push{l}\t%0\n\t"
+ "popf{l|d}\n\t"
+ "pushf{l|d}\n\t"
+ "pop{l}\t%0\n\t"
+ "popf{l|d}\n\t"
+ : "=&r" (__eax), "=&r" (__ebx)
+ : "i" (0x00200000));
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+ nor alternatives in i386 code. */
+ __asm__ ("pushfl\n\t"
+ "pushfl\n\t"
+ "popl\t%0\n\t"
+ "movl\t%0, %1\n\t"
+ "xorl\t%2, %0\n\t"
+ "pushl\t%0\n\t"
+ "popfl\n\t"
+ "pushfl\n\t"
+ "popl\t%0\n\t"
+ "popfl\n\t"
+ : "=&r" (__eax), "=&r" (__ebx)
+ : "i" (0x00200000));
+#endif
+
+ if (!((__eax ^ __ebx) & 0x00200000))
+ return 0;
+#endif
+
+ /* Host supports cpuid. Return highest supported cpuid input value. */
+ __cpuid (__ext, __eax, __ebx, __ecx, __edx);
+
+ if (__sig)
+ *__sig = __ebx;
+
+ return __eax;
+}
+
+/* Return cpuid data for requested cpuid level, as found in returned
+ eax, ebx, ecx and edx registers. The function checks if cpuid is
+ supported and returns 1 for valid cpuid information or 0 for
+ unsupported cpuid level. All pointers are required to be non-null. */
+
+static __inline int
+__get_cpuid (unsigned int __level,
+ unsigned int *__eax, unsigned int *__ebx,
+ unsigned int *__ecx, unsigned int *__edx)
+{
+ unsigned int __ext = __level & 0x80000000;
+
+ if (__get_cpuid_max (__ext, 0) < __level)
+ return 0;
+
+ __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx);
+ return 1;
+}
diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc
new file mode 100644
index 000000000..154aa0f17
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.cc
@@ -0,0 +1,89 @@
+#include <volk/volk.h>
+#include <qa_16s_add_quad_aligned16.h>
+#include <volk/volk_16s_add_quad_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_add_quad_aligned16::t1() {
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+
+
+void qa_16s_add_quad_aligned16::t1() {
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3200;
+ const int ITERS = 100000;
+ short input0[vlen] __attribute__ ((aligned (16)));
+ short input1[vlen] __attribute__ ((aligned (16)));
+ short input2[vlen] __attribute__ ((aligned (16)));
+ short input3[vlen] __attribute__ ((aligned (16)));
+ short input4[vlen] __attribute__ ((aligned (16)));
+
+ short output0[vlen] __attribute__ ((aligned (16)));
+ short output1[vlen] __attribute__ ((aligned (16)));
+ short output2[vlen] __attribute__ ((aligned (16)));
+ short output3[vlen] __attribute__ ((aligned (16)));
+ short output01[vlen] __attribute__ ((aligned (16)));
+ short output11[vlen] __attribute__ ((aligned (16)));
+ short output21[vlen] __attribute__ ((aligned (16)));
+ short output31[vlen] __attribute__ ((aligned (16)));
+
+ for(int i = 0; i < vlen; ++i) {
+ short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short plus1 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short minus1 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short plus2 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short minus2 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short plus3 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+
+ input0[i] = plus0 - minus0;
+ input1[i] = plus1 - minus1;
+ input2[i] = plus2 - minus2;
+ input3[i] = plus3 - minus3;
+ input4[i] = plus4 - minus4;
+
+ }
+ printf("16s_add_quad_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_add_quad_aligned16_manual(output0, output1, output2, output3, input0, input1, input2, input3, input4, vlen << 1 , "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_add_quad_aligned16_manual(output01, output11, output21, output31, input0, input1, input2, input3, input4, vlen << 1 , "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output1[i], output11[i]);
+ CPPUNIT_ASSERT_EQUAL(output2[i], output21[i]);
+ CPPUNIT_ASSERT_EQUAL(output3[i], output31[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_add_quad_aligned16.h b/volk/lib/qa_16s_add_quad_aligned16.h
new file mode 100644
index 000000000..3c1ae978b
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
+#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_add_quad_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
new file mode 100644
index 000000000..62deffaeb
--- /dev/null
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
@@ -0,0 +1,106 @@
+#include <volk/volk.h>
+#include <qa_16s_branch_4_state_8_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16s_branch_4_state_8_aligned16::t1() {
+ printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_branch_4_state_8_aligned16::t1() {
+ const int num_iters = 1000000;
+ const int vlen = 32;
+ // Times three kernels over the same input and checks that all three outputs agree.
+ static char permute0[16]__attribute__((aligned(16))) = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03};
+ static char permute1[16]__attribute__((aligned(16))) = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01};
+ static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f};
+ static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d};
+ static char* permuters[4] = {permute0, permute1, permute2, permute3};
+
+ unsigned int num_bytes = vlen << 1;
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ // target: permute_and_scalar_add ("sse2"); target2: branch_4_state_8 ("ssse3"); target3: branch_4_state_8 ("generic").
+ short target[vlen] __attribute__ ((aligned (16)));
+ short target2[vlen] __attribute__ ((aligned (16)));
+ short target3[vlen] __attribute__ ((aligned (16)));
+
+ short src0[vlen] __attribute__ ((aligned (16)));
+ short permute_indexes[vlen] __attribute__ ((aligned (16))) = {
+7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 };
+ short cntl0[vlen] __attribute__ ((aligned (16))) = {
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
+ short cntl1[vlen] __attribute__ ((aligned (16))) = {
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
+ short cntl2[vlen] __attribute__ ((aligned (16))) = {
+ 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 };
+ short cntl3[vlen] __attribute__ ((aligned (16))) = {
+ 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff };
+ short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4};
+
+
+ // The source vector is simply 0, 1, ..., vlen-1.
+ for(int i = 0; i < vlen; ++i) {
+ src0[i] = i;
+
+ }
+
+
+ printf("16s_branch_4_state_8_aligned\n");
+
+ // Reference run: permute_and_scalar_add, "sse2" implementation, writes target.
+ start = clock();
+ for(int i = 0; i < num_iters; ++i) {
+ volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
+ }
+ end = clock();
+
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+ printf("permute_and_scalar_add_time: %f\n", total);
+
+
+ // branch_4_state_8, "ssse3" implementation, writes target2.
+ start = clock();
+ for(int i = 0; i < num_iters; ++i) {
+ volk_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3");
+ }
+ end = clock();
+
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+ printf("branch_4_state_8_time, ssse3: %f\n", total);
+ // branch_4_state_8, "generic" implementation, writes target3.
+ start = clock();
+ for(int i = 0; i < num_iters; ++i) {
+ volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic");
+ }
+ end = clock();
+
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ // This loop timed branch_4_state_8 (generic), so label it as such.
+ printf("branch_4_state_8_time, generic: %f\n", total);
+
+
+ // Dump the reference output next to the generic branch_4_state_8 output.
+ for(int i = 0; i < vlen; ++i) {
+ printf("psa... %d, b4s8... %d\n", target[i], target3[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ // Both branch_4_state_8 implementations must reproduce the reference element-wise.
+ CPPUNIT_ASSERT(target[i] == target2[i]);
+ CPPUNIT_ASSERT(target[i] == target3[i]);
+ }
+}
+
+
+#endif
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.h b/volk/lib/qa_16s_branch_4_state_8_aligned16.h
new file mode 100644
index 000000000..41ab073e0
--- /dev/null
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
+#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
new file mode 100644
index 000000000..819b2256b
--- /dev/null
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
@@ -0,0 +1,78 @@
+#include <volk/volk.h>
+#include <qa_16s_permute_and_scalar_add_aligned16.h>
+#include <volk/volk_16s_permute_and_scalar_add_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_permute_and_scalar_add_aligned16::t1() {
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_permute_and_scalar_add_aligned16::t1() {
+ const int vlen = 64;
+
+ unsigned int num_bytes = vlen << 1;
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+
+ short target[vlen] __attribute__ ((aligned (16)));
+ short target2[vlen] __attribute__ ((aligned (16)));
+ short src0[vlen] __attribute__ ((aligned (16)));
+ short permute_indexes[vlen] __attribute__ ((aligned (16)));
+ short cntl0[vlen] __attribute__ ((aligned (16)));
+ short cntl1[vlen] __attribute__ ((aligned (16)));
+ short cntl2[vlen] __attribute__ ((aligned (16)));
+ short cntl3[vlen] __attribute__ ((aligned (16)));
+ short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4};
+
+ for(int i = 0; i < vlen; ++i) {
+ src0[i] = i;
+ permute_indexes[i] = (3 * i)%vlen;
+ cntl0[i] = 0xff;
+ cntl1[i] = 0xff * (i%2);
+ cntl2[i] = 0xff * ((i>>1)%2);
+ cntl3[i] = 0xff * ((i%4) == 3);
+ }
+
+ printf("16s_permute_and_scalar_add_aligned\n");
+
+ start = clock();
+ for(int i = 0; i < 100000; ++i) {
+ volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic");
+ }
+ end = clock();
+
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+ printf("generic_time: %f\n", total);
+
+ start = clock();
+ for(int i = 0; i < 100000; ++i) {
+ volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
+ }
+ end = clock();
+
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+ printf("sse2_time: %f\n", total);
+
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("generic... %d, sse2... %d\n", target[i], target2[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+
+ CPPUNIT_ASSERT(target[i] == target2[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h
new file mode 100644
index 000000000..3643aeef6
--- /dev/null
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc
new file mode 100644
index 000000000..66f8c9afa
--- /dev/null
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16s_quad_max_star_aligned16.h>
+#include <volk/volk_16s_quad_max_star_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_quad_max_star_aligned16::t1() {
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_quad_max_star_aligned16::t1() {
+ const int vlen = 34;
+
+ short input0[vlen] __attribute__ ((aligned (16)));
+ short input1[vlen] __attribute__ ((aligned (16)));
+ short input2[vlen] __attribute__ ((aligned (16)));
+ short input3[vlen] __attribute__ ((aligned (16)));
+
+ short output0[vlen] __attribute__ ((aligned (16)));
+ short output1[vlen] __attribute__ ((aligned (16)));
+
+ for(int i = 0; i < vlen; ++i) {
+ short plus0 = (short) (rand() - (RAND_MAX/2));
+ short plus1 = (short) (rand() - (RAND_MAX/2));
+ short plus2 = (short) (rand() - (RAND_MAX/2));
+ short plus3 = (short) (rand() - (RAND_MAX/2));
+
+ short minus0 = (short) (rand() - (RAND_MAX/2));
+ short minus1 = (short) (rand() - (RAND_MAX/2));
+ short minus2 = (short) (rand() - (RAND_MAX/2));
+ short minus3 = (short) (rand() - (RAND_MAX/2));
+
+ input0[i] = plus0 - minus0;
+ input1[i] = plus1 - minus1;
+ input2[i] = plus2 - minus2;
+ input3[i] = plus3 - minus3;
+ }
+
+ volk_16s_quad_max_star_aligned16_manual(output0, input0, input1, input2, input3, 2*vlen, "generic");
+
+ volk_16s_quad_max_star_aligned16_manual(output1, input0, input1, input2, input3, 2*vlen, "sse2");
+
+ printf("16s_quad_max_star_aligned\n");
+ for(int i = 0; i < vlen; ++i) {
+ printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+
+ CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.h b/volk/lib/qa_16s_quad_max_star_aligned16.h
new file mode 100644
index 000000000..51e77081a
--- /dev/null
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
+#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc
new file mode 100644
index 000000000..592304f83
--- /dev/null
+++ b/volk/lib/qa_32f_fm_detect_aligned16.cc
@@ -0,0 +1,61 @@
+#include <volk/volk.h>
+#include <qa_32f_fm_detect_aligned16.h>
+#include <volk/volk_32f_fm_detect_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_fm_detect_aligned16::t1() {
+ printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_fm_detect_aligned16::t1() {
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 10000;
+ float input0[vlen] __attribute__ ((aligned (16)));
+
+ float output0[vlen] __attribute__ ((aligned (16)));
+ float output01[vlen] __attribute__ ((aligned (16)));
+
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+ }
+ printf("32f_fm_detect_aligned\n");
+
+ start = clock();
+ float save = 0.1;
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_fm_detect_aligned16_manual(output0, input0, 1.0, &save, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ save = 0.1;
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_fm_detect_aligned16_manual(output01, input0, 1.0, &save, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.h b/volk/lib/qa_32f_fm_detect_aligned16.h
new file mode 100644
index 000000000..a2680c524
--- /dev/null
+++ b/volk/lib/qa_32f_fm_detect_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
+#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc
new file mode 100644
index 000000000..a1c3d4cd1
--- /dev/null
+++ b/volk/lib/qa_32f_index_max_aligned16.cc
@@ -0,0 +1,103 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_index_max_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 1000000
+#define VEC_LEN 3097
+static float uniform() {
+ return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+ unsigned int i = 0;
+ for (; i < n; i++) {
+
+ buf[i] = uniform () * 32767;
+
+ }
+}
+
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_index_max_aligned16::t1(){
+ printf("sse not available... no test performed\n");
+}
+
+#else
+
+
+void qa_32f_index_max_aligned16::t1(){
+ // Times three index_max invocations on one random vector and checks they return the same index.
+ const int vlen = VEC_LEN;
+
+
+ volk_runtime_init();
+
+ volk_environment_init();
+ int ret;
+
+ unsigned int* target_sse4_1;
+ unsigned int* target_sse;
+ unsigned int* target_generic;
+ float* src0 ;
+
+ // Each result is a single index; the pointers just alias the locals below.
+ unsigned int i_target_sse4_1;
+ target_sse4_1 = &i_target_sse4_1;
+ unsigned int i_target_sse;
+ target_sse = &i_target_sse;
+ unsigned int i_target_generic;
+ target_generic = &i_target_generic;
+
+ ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float));
+ CPPUNIT_ASSERT_EQUAL(0, ret); // src0 is not valid if the allocation failed
+ random_floats((float*)src0, vlen);
+
+ printf("32f_index_max_aligned16\n");
+
+ clock_t start, end;
+ double total;
+
+ // Explicitly selected "generic" implementation.
+ start = clock();
+ for(int k = 0; k < NUM_ITERS; ++k) {
+ volk_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic time: %f\n", total);
+ // NOTE(review): the arch string is "sse2" but the output is labelled "sse" -- confirm which is intended.
+ start = clock();
+ for(int k = 0; k < NUM_ITERS; ++k) {
+ volk_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2");
+ }
+
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse time: %f\n", total);
+ // NOTE(review): this goes through get_volk_runtime(); presumably it picks sse4.1 where available -- confirm.
+ start = clock();
+ for(int k = 0; k < NUM_ITERS; ++k) {
+ get_volk_runtime()->volk_32f_index_max_aligned16(target_sse4_1, src0, vlen);
+ }
+
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4.1 time: %f\n", total);
+
+ // All three invocations must report the same index.
+ printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]);
+ CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]);
+ CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]);
+
+ free(src0);
+}
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32f_index_max_aligned16.h b/volk/lib/qa_32f_index_max_aligned16.h
new file mode 100644
index 000000000..8cadffa47
--- /dev/null
+++ b/volk/lib/qa_32f_index_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the volk_32f_index_max_aligned16 kernel.
+// The suite macros register t1 as the only test of this fixture.
+class qa_32f_index_max_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the kernel implementations; defined in the matching .cc file.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc
new file mode 100644
index 000000000..4d83f1639
--- /dev/null
+++ b/volk/lib/qa_32fc_index_max_aligned16.cc
@@ -0,0 +1,89 @@
+#include <volk/volk.h>
+#include <qa_32fc_index_max_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 1000000
+#define VEC_LEN 3096
+// Draws one pseudo-random sample from rand(), rescaled to the range [-1, 1].
+static float uniform() {
+  const float unit = (float) rand() / RAND_MAX; // 0 .. 1
+  const double centered = unit - 0.5;           // -0.5 .. 0.5
+  return 2.0 * centered;
+}
+
+// Fills buf[0..n) with uniform() samples scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  float *const stop = buf + n;
+  for (float *cursor = buf; cursor != stop; ++cursor)
+    *cursor = uniform () * 32767;
+}
+
+
+#ifndef LV_HAVE_SSE3
+
+// Built when LV_HAVE_SSE3 is not defined: only reports that the test was skipped.
+void qa_32fc_index_max_aligned16::t1(){
+  fputs("sse3 not available... no test performed\n", stdout);
+}
+
+#else
+
+
+// Times the "generic" and "sse3" volk_32fc_index_max_aligned16 kernels on one
+// random complex vector and checks that the two reported indices agree to
+// within the 1.1 delta passed to the assertion.
+void qa_32fc_index_max_aligned16::t1(){
+
+  const int vlen = VEC_LEN;
+
+  volk_environment_init();
+
+  // One result slot per kernel so the answers can be compared afterwards.
+  unsigned int i_target = 0;
+  unsigned int i_target_generic = 0;
+  unsigned int* target = &i_target;
+  unsigned int* target_generic = &i_target_generic;
+
+  // The buffer is sized as vlen << 3 bytes and then filled with vlen * 2 floats.
+  // posix_memalign reports failure through its return value; check it before
+  // the buffer is touched.
+  std::complex<float>* src0 = NULL;
+  int ret = posix_memalign((void**)&src0, 16, vlen << 3);
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+
+  random_floats((float*)src0, vlen * 2);
+
+  printf("32fc_index_max_aligned16\n");
+
+  clock_t start, end;
+  double total;
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3 time: %f\n", total);
+
+  printf("generic: %u, sse3: %u\n", target_generic[0], target[0]);
+
+  // A failing CppUnit assertion throws, so release the buffer first.
+  free(src0);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1);
+}
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32fc_index_max_aligned16.h b/volk/lib/qa_32fc_index_max_aligned16.h
new file mode 100644
index 000000000..0990bcb1f
--- /dev/null
+++ b/volk/lib/qa_32fc_index_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the volk_32fc_index_max_aligned16 kernel.
+// The suite macros register t1 as the only test of this fixture.
+class qa_32fc_index_max_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic and sse3 kernels; defined in the matching .cc file.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
new file mode 100644
index 000000000..a3d0955bd
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
@@ -0,0 +1,64 @@
+#include <volk/volk.h>
+#include <qa_32fc_power_spectral_density_32f_aligned16.h>
+#include <volk/volk_32fc_power_spectral_density_32f_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse3
+
+#ifndef LV_HAVE_SSE3
+
+// Built when LV_HAVE_SSE3 is not defined: only reports that the test was skipped.
+void qa_32fc_power_spectral_density_32f_aligned16::t1() {
+  fputs("sse3 not available... no test performed\n", stdout);
+}
+
+#else
+
+// Runs the "generic" and "sse3" power-spectral-density kernels ITERS times each
+// on the same random complex input, prints the CPU time of both, and requires
+// every output element to agree within a relative tolerance of 1e-4.
+void qa_32fc_power_spectral_density_32f_aligned16::t1() {
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 10000;
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse3[vlen] __attribute__ ((aligned (16)));
+
+  const float scalar = vlen;
+  const float rbw = 1.7;
+
+  // Fill real and imaginary parts through a flat float view of the input.
+  float* inputLoad = (float*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {
+    inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+  }
+  printf("32fc_power_spectral_density_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  // The allowed delta scales with the magnitude of the generic result.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4));
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h
new file mode 100644
index 000000000..26f430bec
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the volk_32fc_power_spectral_density_32f_aligned16 kernel.
+// The suite macros register t1 as the only test of this fixture.
+class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic and sse3 kernels; defined in the matching .cc file.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc
new file mode 100644
index 000000000..fefdf06ee
--- /dev/null
+++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc
@@ -0,0 +1,138 @@
+#include <volk/volk.h>
+#include <qa_32fc_x2_conjugate_dot_prod_32fc_u.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+
+#define assertcomplexEqual(expected, actual, delta) \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);
+
+#define ERR_DELTA (1e-4)
+
+//test for sse
+
+#if LV_HAVE_SSE && (LV_HAVE_64 || LV_HAVE_32)
+
+// Name of the SSE kernel compared against "generic": "sse" when LV_HAVE_64 is
+// set, otherwise "sse_32". Both variants share the helpers and test body below.
+#if LV_HAVE_64
+#define QA_CONJUGATE_DOT_PROD_SSE_ARCH "sse"
+#else
+#define QA_CONJUGATE_DOT_PROD_SSE_ARCH "sse_32"
+#endif
+
+// Draws one pseudo-random sample from rand(), rescaled to the range [-1, 1].
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);
+}
+
+// Fills buf[0..n) with uniform() samples scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned i = 0; i < n; i++)
+    buf[i] = uniform () * 32767;
+}
+
+
+// Computes the conjugate dot product of two random complex vectors with the
+// "generic" kernel and with the SSE kernel selected above, then requires the
+// real and imaginary parts to agree within the relative tolerance ERR_DELTA.
+void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
+  const int vlen = 789743;
+
+  volk_environment_init();
+
+  std::complex<float>* input = NULL;
+  std::complex<float>* taps = NULL;
+  std::complex<float>* result_generic = NULL;
+  std::complex<float>* result = NULL;
+
+  // Stop at the first failed allocation; posix_memalign reports failure
+  // through its return value, which is checked before any buffer is used.
+  int ret = posix_memalign((void**)&input, 16, vlen << 3);
+  if(ret == 0) ret = posix_memalign((void**)&taps, 16, vlen << 3);
+  if(ret == 0) ret = posix_memalign((void**)&result_generic, 16, 8);
+  if(ret == 0) ret = posix_memalign((void**)&result, 16, 8);
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+
+  result_generic[0] = std::complex<float>(0,0);
+  result[0] = std::complex<float>(0,0);
+
+  random_floats((float*)input, vlen * 2);
+  random_floats((float*)taps, vlen * 2);
+
+  volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic");
+  volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, QA_CONJUGATE_DOT_PROD_SSE_ARCH);
+
+  printf("32fc_x2_conjugate_dot_prod_32fc_u\n");
+  printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0]));
+
+  // Copy the results out and release every buffer before asserting: a failing
+  // CppUnit assertion throws and would otherwise skip the free() calls.
+  const std::complex<float> expected = result_generic[0];
+  const std::complex<float> actual = result[0];
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result);
+
+  assertcomplexEqual(expected, actual, ERR_DELTA);
+}
+
+
+#else
+
+// Built when no supported SSE configuration is enabled: only reports the skip.
+void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h
new file mode 100644
index 000000000..f07402403
--- /dev/null
+++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H
+#define INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the volk_32fc_x2_conjugate_dot_prod_32fc_u kernel.
+// The suite macros register t1 as the only test of this fixture.
+class qa_32fc_x2_conjugate_dot_prod_32fc_u : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_x2_conjugate_dot_prod_32fc_u);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic and SSE kernels; defined in the matching .cc file.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H */
diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc
new file mode 100644
index 000000000..618a82a02
--- /dev/null
+++ b/volk/lib/qa_32u_popcnt_aligned16.cc
@@ -0,0 +1,62 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32u_popcnt_aligned16.h>
+#include <volk/volk_32u_popcnt_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse4.2
+
+#ifndef LV_HAVE_SSE4_2
+
+// Built when LV_HAVE_SSE4_2 is not defined: only reports that the test was skipped.
+void qa_32u_popcnt_aligned16::t1() {
+  fputs("sse4.2 not available... no test performed\n", stdout);
+}
+
+#else
+
+// Times the "generic" and the runtime-dispatched 32-bit popcount kernels on a
+// single random word and checks that the accumulated bit counts are equal.
+void qa_32u_popcnt_aligned16::t1() {
+  volk_runtime_init();
+  volk_environment_init();
+
+  const int ITERS = 10000000;
+  uint32_t word __attribute__ ((aligned (16))) = ((uint32_t) (rand() - (RAND_MAX/2)));
+  uint32_t generic_total __attribute__ ((aligned (16))) = 0;
+  uint32_t runtime_total __attribute__ ((aligned (16))) = 0;
+
+  printf("32u_popcnt_aligned\n");
+
+  // Pass 1: kernel picked explicitly by name.
+  uint32_t bits = 0;
+  clock_t began = clock();
+  for(int remaining = ITERS; remaining > 0; --remaining) {
+    volk_32u_popcnt_aligned16_manual(&bits, word, "generic");
+    generic_total += bits;
+  }
+  clock_t finished = clock();
+  printf("generic_time: %f\n", (double)(finished-began)/(double)CLOCKS_PER_SEC);
+
+  // Pass 2: kernel picked by the runtime dispatch table.
+  bits = 0;
+  began = clock();
+  for(int remaining = ITERS; remaining > 0; --remaining) {
+    get_volk_runtime()->volk_32u_popcnt_aligned16(&bits, word);
+    runtime_total += bits;
+  }
+  finished = clock();
+  printf("sse4.2_time: %f\n", (double)(finished-began)/(double)CLOCKS_PER_SEC);
+
+  CPPUNIT_ASSERT_EQUAL(generic_total, runtime_total);
+}
+
+#endif
diff --git a/volk/lib/qa_32u_popcnt_aligned16.h b/volk/lib/qa_32u_popcnt_aligned16.h
new file mode 100644
index 000000000..fa1dc1041
--- /dev/null
+++ b/volk/lib/qa_32u_popcnt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32U_POPCNT_ALIGNED16_H
+#define INCLUDED_QA_32U_POPCNT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the volk_32u_popcnt_aligned16 kernel.
+// The suite macros register t1 as the only test of this fixture.
+class qa_32u_popcnt_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32u_popcnt_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic and runtime-selected kernels; defined in the matching .cc file.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32U_POPCNT_ALIGNED16_H */
diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc
new file mode 100644
index 000000000..85ef58795
--- /dev/null
+++ b/volk/lib/qa_64u_popcnt_aligned16.cc
@@ -0,0 +1,62 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_64u_popcnt_aligned16.h>
+#include <volk/volk_64u_popcnt_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse4.2
+
+#ifndef LV_HAVE_SSE4_2
+
+// Built when LV_HAVE_SSE4_2 is not defined: only reports that the test was skipped.
+void qa_64u_popcnt_aligned16::t1() {
+  fputs("sse4.2 not available... no test performed\n", stdout);
+}
+
+#else
+
+// Times the "generic" and the runtime-dispatched 64-bit popcount kernels on a
+// single random word and checks that the accumulated bit counts are equal.
+void qa_64u_popcnt_aligned16::t1() {
+  volk_runtime_init();
+  volk_environment_init();
+
+  const int ITERS = 10000000;
+  uint64_t word __attribute__ ((aligned (16))) = ((uint64_t) (rand() - (RAND_MAX/2)));
+  uint64_t generic_total __attribute__ ((aligned (16))) = 0;
+  uint64_t runtime_total __attribute__ ((aligned (16))) = 0;
+
+  printf("64u_popcnt_aligned\n");
+
+  // Pass 1: kernel picked explicitly by name.
+  uint64_t bits = 0;
+  clock_t began = clock();
+  for(int remaining = ITERS; remaining > 0; --remaining) {
+    volk_64u_popcnt_aligned16_manual(&bits, word, "generic");
+    generic_total += bits;
+  }
+  clock_t finished = clock();
+  printf("generic_time: %f\n", (double)(finished-began)/(double)CLOCKS_PER_SEC);
+
+  // Pass 2: kernel picked by the runtime dispatch table.
+  bits = 0;
+  began = clock();
+  for(int remaining = ITERS; remaining > 0; --remaining) {
+    get_volk_runtime()->volk_64u_popcnt_aligned16(&bits, word);
+    runtime_total += bits;
+  }
+  finished = clock();
+  printf("sse4.2_time: %f\n", (double)(finished-began)/(double)CLOCKS_PER_SEC);
+
+  CPPUNIT_ASSERT_EQUAL(generic_total, runtime_total);
+}
+
+#endif
diff --git a/volk/lib/qa_64u_popcnt_aligned16.h b/volk/lib/qa_64u_popcnt_aligned16.h
new file mode 100644
index 000000000..217822d6e
--- /dev/null
+++ b/volk/lib/qa_64u_popcnt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64U_POPCNT_ALIGNED16_H
+#define INCLUDED_QA_64U_POPCNT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the volk_64u_popcnt_aligned16 kernel.
+// The suite macros register t1 as the only test of this fixture.
+class qa_64u_popcnt_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_64u_popcnt_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Compares the generic and runtime-selected kernels; defined in the matching .cc file.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_64U_POPCNT_ALIGNED16_H */
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
new file mode 100644
index 000000000..b0f63d2b5
--- /dev/null
+++ b/volk/lib/qa_utils.cc
@@ -0,0 +1,440 @@
+#include "qa_utils.h"
+#include <cstring>
+#include <boost/foreach.hpp>
+#include <boost/assign/list_of.hpp>
+#include <boost/tokenizer.hpp>
+//#include <boost/test/unit_test.hpp>
+#include <iostream>
+#include <vector>
+#include <list>
+#include <ctime>
+#include <cmath>
+#include <boost/lexical_cast.hpp>
+//#include <volk/volk_runtime.h>
+#include <volk/volk_registry.h>
+#include <volk/volk.h>
+#include <boost/typeof/typeof.hpp>
+#include <boost/type_traits.hpp>
+
+// Draws one pseudo-random sample from rand(), rescaled to the range [-1, 1].
+float uniform() {
+  const float unit = (float) rand() / RAND_MAX; // 0 .. 1
+  const double centered = unit - 0.5;           // -0.5 .. 0.5
+  return 2.0 * centered;
+}
+
+// Fills buf[0..n) with uniform() samples converted to the element type t.
+template <class t>
+void random_floats (t *buf, unsigned n)
+{
+  t *cursor = buf;
+  for (unsigned remaining = n; remaining != 0; --remaining)
+    *cursor++ = uniform ();
+}
+
+// Fills a buffer of n items of the given volk type with random values.
+//   data: destination buffer; must hold n items (2*n components when complex)
+//   type: element description; size is in bytes
+//   n:    number of items; doubled internally for complex types
+// Floating-point buffers receive uniform() samples. Integer buffers receive
+// values spread over the whole range of the element type.
+// Throws a C string when an integer type has a size other than 1, 2, 4 or 8.
+void load_random_data(void *data, volk_type_t type, unsigned int n) {
+  if(type.is_complex) n *= 2;
+
+  if(type.is_float) {
+    if(type.size == 8) random_floats<double>((double *)data, n);
+    else random_floats<float>((float *)data, n);
+    return;
+  }
+
+  const int bits = type.size * 8;
+  for(unsigned int i=0; i<n; i++) {
+    // unit lies in [0, 1): dividing by RAND_MAX + 1.0 keeps it strictly below 1,
+    // so the scaled value never reaches 2^bits.
+    const double unit = rand() / (RAND_MAX + 1.0);
+    // signed:   [-2^(bits-1), 2^(bits-1))
+    // unsigned: [0, 2^bits)
+    // Both ranges fit the destination type, so the casts below are always in
+    // range. Scaling with ldexp avoids shifting an integer by its full width.
+    const double scaled_rand = std::ldexp(type.is_signed ? unit - 0.5 : unit, bits);
+    switch(type.size) {
+    case 8:
+      if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand;
+      else ((uint64_t *)data)[i] = (uint64_t) scaled_rand;
+      break;
+    case 4:
+      if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand;
+      else ((uint32_t *)data)[i] = (uint32_t) scaled_rand;
+      break;
+    case 2:
+      if(type.is_signed) ((int16_t *)data)[i] = (int16_t) scaled_rand;
+      else ((uint16_t *)data)[i] = (uint16_t) scaled_rand;
+      break;
+    case 1:
+      if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand;
+      else ((uint8_t *)data)[i] = (uint8_t) scaled_rand;
+      break;
+    default:
+      throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here
+    }
+  }
+}
+
+// Translates an architecture table into the matching architecture names.
+// archs[0] holds the number of entries and the entries follow it; each entry
+// is compared against 1<<LV_* and unrecognised entries are dropped.
+static std::vector<std::string> get_arch_list(const int archs[]) {
+  struct arch_entry {
+    int mask;
+    const char *label;
+  };
+  //there has got to be a way to query these arches
+  static const arch_entry known[] = {
+    { (1<<LV_GENERIC), "generic" },
+    { (1<<LV_ORC),     "orc" },
+    { (1<<LV_SSE),     "sse" },
+    { (1<<LV_SSE2),    "sse2" },
+    { (1<<LV_SSE3),    "sse3" },
+    { (1<<LV_SSSE3),   "ssse3" },
+    { (1<<LV_SSE4_1),  "sse4_1" },
+    { (1<<LV_SSE4_2),  "sse4_2" },
+    { (1<<LV_SSE4_A),  "sse4_a" },
+    { (1<<LV_MMX),     "mmx" },
+    { (1<<LV_AVX),     "avx" },
+  };
+  const size_t num_known = sizeof(known) / sizeof(known[0]);
+
+  std::vector<std::string> names;
+  const int count = archs[0];
+  for(int slot = 1; slot <= count; ++slot) {
+    for(size_t k = 0; k < num_known; ++k) {
+      if(archs[slot] == known[k].mask) {
+        names.push_back(known[k].label);
+        break;
+      }
+    }
+  }
+  return names;
+}
+
+// Parses one type token of a kernel name (for example "32fc" or "s32f") into a
+// volk_type_t.
+//   - a leading 's' marks a scalar and is stripped before further parsing
+//   - the leading digits give the size in bits, stored in bytes
+//   - trailing letters: 'f' float, 'i' signed, 'u' unsigned, 'c' complex
+// Every malformed token is reported by throwing (std::string here,
+// boost::bad_lexical_cast from the size conversion).
+volk_type_t volk_type_from_string(std::string name) {
+  volk_type_t type;
+  type.is_float = false;
+  type.is_scalar = false;
+  type.is_complex = false;
+  type.is_signed = false;
+  type.size = 0;
+  type.str = name;
+
+  if(name.size() < 2) throw std::string("name too short to be a datatype");
+
+  //is it a scalar?
+  if(name[0] == 's') {
+    type.is_scalar = true;
+    name = name.substr(1, name.size()-1);
+  }
+
+  //get the data size
+  const std::string::size_type last_size_pos = name.find_last_of("0123456789");
+  if(last_size_pos == std::string::npos) throw std::string("no size spec in type ").append(name);
+  //will throw if malformed
+  int size = boost::lexical_cast<int>(name.substr(0, last_size_pos+1));
+
+  // Throw instead of asserting, so a bad size is reported like every other
+  // malformed token, including in builds where assert is compiled out.
+  if(((size % 8) != 0) || (size > 64) || (size <= 0))
+    throw std::string("invalid size spec in type ").append(name);
+  type.size = size/8; //in bytes
+
+  for(std::string::size_type i=last_size_pos+1; i < name.size(); i++) {
+    switch (name[i]) {
+    case 'f':
+      type.is_float = true;
+      break;
+    case 'i':
+      type.is_signed = true;
+      break;
+    case 'c':
+      type.is_complex = true;
+      break;
+    case 'u':
+      type.is_signed = false;
+      break;
+    default:
+      // A bare "throw;" with no exception in flight calls std::terminate, so
+      // throw a real exception object.
+      throw std::string("unrecognized type qualifier in ").append(name);
+    }
+  }
+
+  return type;
+}
+
+// Derives the input and output type signatures of a kernel from its name.
+// The name is split on '_', the first token must be "volk", and each remaining
+// token is classified by trying to parse it as a type:
+//   - tokens that parse go to inputsig until a non-type token has been seen,
+//     and to outputsig afterwards
+//   - tokens that fail to parse are handled in the catch block below
+// inputsig and outputsig are appended to; at least one input is required.
+static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
+ std::vector<volk_type_t> &outputsig,
+ std::string name) {
+ boost::char_separator<char> sep("_");
+ boost::tokenizer<boost::char_separator<char> > tok(name, sep);
+ std::vector<std::string> toked;
+ tok.assign(name);
+ toked.assign(tok.begin(), tok.end());
+
+ assert(toked[0] == "volk");
+ toked.erase(toked.begin());
+
+ //ok. we're assuming a string in the form
+ //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
+
+ enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT;
+ // fn_name is assembled below but not read again inside this function.
+ std::string fn_name;
+ volk_type_t type;
+ BOOST_FOREACH(std::string token, toked) {
+ try {
+ type = volk_type_from_string(token);
+ if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
+
+ if(side == SIDE_INPUT) inputsig.push_back(type);
+ else outputsig.push_back(type);
+ } catch (...){
+ // The token is not a type. A leading 'x' makes it a multiplier: the most
+ // recent signature entry is repeated until it appears 'multiplier' times.
+ if(token[0] == 'x') { //it's a multiplier
+ if(side == SIDE_INPUT) assert(inputsig.size() > 0);
+ else assert(outputsig.size() > 0);
+ int multiplier = boost::lexical_cast<int>(token.substr(1, token.size()-1)); //will throw if invalid
+ for(int i=1; i<multiplier; i++) {
+ if(side == SIDE_INPUT) inputsig.push_back(inputsig.back());
+ else outputsig.push_back(outputsig.back());
+ }
+ }
+ else if(side == SIDE_INPUT) { //it's the function name, at least it better be
+ side = SIDE_NAME;
+ fn_name.append("_");
+ fn_name.append(token);
+ }
+ // While side is SIDE_NAME, further non-type tokens match no branch here and
+ // are ignored.
+ else if(side == SIDE_OUTPUT) {
+ // Inside a handler, a bare throw re-raises the exception being handled.
+ if(token != toked.back()) throw; //the last token in the name is the alignment
+ }
+ }
+ }
+ //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input!
+ assert(inputsig.size() != 0);
+}
+
+// Calls a one-buffer kernel iter times with the given architecture name.
+inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass) {
+    func(buffs[0], vlen, arch.c_str());
+  }
+}
+
+// Calls a two-buffer kernel iter times with the given architecture name.
+inline void run_cast_test2(volk_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass) {
+    func(buffs[0], buffs[1], vlen, arch.c_str());
+  }
+}
+
+// Calls a three-buffer kernel iter times with the given architecture name.
+inline void run_cast_test3(volk_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass) {
+    func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
+  }
+}
+
+// Calls a four-buffer kernel iter times with the given architecture name.
+inline void run_cast_test4(volk_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass) {
+    func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
+  }
+}
+
+// Calls a one-buffer kernel that also takes a float scalar, iter times.
+inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass) {
+    func(buffs[0], scalar, vlen, arch.c_str());
+  }
+}
+
+// Calls a two-buffer kernel that also takes a float scalar, iter times.
+inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass) {
+    func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+  }
+}
+
+// Calls a three-buffer kernel that also takes a float scalar, iter times.
+inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass) {
+    func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+  }
+}
+
+// Element-wise relative comparison of two floating-point buffers.
+//   in1:  reference values
+//   in2:  values under test
+//   vlen: number of elements to compare
+//   tol:  largest accepted |in1[i] - in2[i]| / |in1[i]|
+// Reference values with magnitude below 1e-30 are skipped. Magnitudes are used
+// for both the skip test and the denominator, so negative reference values are
+// compared like positive ones.
+// Prints the first 10 mismatches and returns true when any element fails.
+template <class t>
+bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
+  bool fail = false;
+  int print_max_errs = 10;
+  for(unsigned int i=0; i<vlen; i++) {
+    const t expected = in1[i];
+    const t actual = in2[i];
+    const t magnitude = std::fabs(expected);
+    if(magnitude < 1e-30) continue; //this is a hack: below around here we'll start to get roundoff errors due to limited precision
+    if(std::fabs(expected - actual)/magnitude > tol) {
+      fail=true;
+      if(print_max_errs-- > 0) {
+        std::cout << "offset " << i << " in1: " << expected << " in2: " << actual << std::endl;
+      }
+    }
+  }
+
+  return fail;
+}
+
+// Element-wise absolute comparison of two integer buffers.
+//   in1:  reference values
+//   in2:  values under test
+//   vlen: number of elements to compare
+//   tol:  largest accepted |in1[i] - in2[i]|
+// The difference is formed as (larger - smaller) in 64-bit unsigned arithmetic,
+// which gives the true magnitude for signed and unsigned element types alike,
+// without wrap-around or truncation.
+// Prints the first 10 mismatches and returns true when any element fails.
+template <class t>
+bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) {
+  bool fail = false;
+  int print_max_errs = 10;
+  for(unsigned int i=0; i<vlen; i++) {
+    const t expected = in1[i];
+    const t actual = in2[i];
+    const uint64_t diff = (expected > actual)
+                            ? uint64_t(expected) - uint64_t(actual)
+                            : uint64_t(actual) - uint64_t(expected);
+    if(diff > tol) {
+      fail=true;
+      if(print_max_errs-- > 0) {
+        // Unary plus promotes 8-bit values to int so they print as numbers.
+        std::cout << "offset " << i << " in1: " << +expected << " in2: " << +actual << std::endl;
+      }
+    }
+  }
+
+  return fail;
+}
+
+// Hands out zero-filled, aligned scratch buffers. Every backing allocation is
+// kept in a list owned by the pool, so the buffers stay valid for as long as
+// the pool object exists.
+class volk_qa_aligned_mem_pool{
+public:
+  // Returns an aligned pointer with at least `size` usable bytes behind it.
+  // The rounding mask assumes `alignment` is a power of two.
+  void *get_new(size_t size, size_t alignment = 16){
+    const size_t slack = alignment - 1;
+    std::vector<char> storage(size + slack, 0);
+    _mems.push_back(storage);
+    const size_t base = size_t(&_mems.back().front());
+    return (void *)((base + slack) & ~slack);
+  }
+private:
+  std::list<std::vector<char> > _mems;
+};
+
+// Runs one kernel on every listed architecture and compares each result with
+// the "generic" result.
+//   archs:       architecture table (entry count first, then the entries)
+//   manual_func: the kernel's _manual entry point, cast to the signature
+//                derived from the kernel name before it is called
+//   name:        kernel name; parsed to obtain input/output signatures
+//   tol:         tolerance handed to fcompare/icompare (icompare takes an
+//                unsigned tolerance, so fractional values are truncated there)
+//   scalar:      value passed to kernels that take one float scalar input
+//   vlen:        number of items per buffer
+//   iter:        number of calls per architecture (timed as a whole)
+// Returns true when any architecture differs from generic beyond tol, and false
+// otherwise, including when fewer than two architectures are listed.
+// Throws a C string for signatures that have no handler.
+bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, float scalar, int vlen, int iter) {
+  std::cout << "RUN_VOLK_TESTS: " << name << std::endl;
+
+  //first let's get a list of available architectures for the test
+  std::vector<std::string> arch_list = get_arch_list(archs);
+
+  if(arch_list.size() < 2) {
+    std::cout << "no architectures to test" << std::endl;
+    return false;
+  }
+
+  //something that can hang onto memory and cleanup when this function exits
+  volk_qa_aligned_mem_pool mem_pool;
+
+  //now we have to get a function signature by parsing the name
+  std::vector<volk_type_t> inputsig, outputsig;
+  get_signatures_from_name(inputsig, outputsig, name);
+
+  //pull the input scalars into their own vector
+  // Only advance when nothing was erased: erase() moves the next element into
+  // slot i, and stepping past it would leave an adjacent scalar in inputsig.
+  std::vector<volk_type_t> inputsc;
+  for(size_t i=0; i<inputsig.size(); ) {
+    if(inputsig[i].is_scalar) {
+      inputsc.push_back(inputsig[i]);
+      inputsig.erase(inputsig.begin() + i);
+    } else {
+      ++i;
+    }
+  }
+
+  std::vector<void *> inbuffs;
+  BOOST_FOREACH(volk_type_t sig, inputsig) {
+    if(!sig.is_scalar) //we don't make buffers for scalars
+      inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1)));
+  }
+  for(int i=0; i<inbuffs.size(); i++) {
+    load_random_data(inbuffs[i], inputsig[i], vlen);
+  }
+
+  //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch
+  // Outputs come first and are private to each arch; the input buffers are shared.
+  std::vector<std::vector<void *> > test_data;
+  for(int i=0; i<arch_list.size(); i++) {
+    std::vector<void *> arch_buffs;
+    for(int j=0; j<outputsig.size(); j++) {
+      arch_buffs.push_back(mem_pool.get_new(vlen*outputsig[j].size*(outputsig[j].is_complex ? 2 : 1)));
+    }
+    for(int j=0; j<inputsig.size(); j++) {
+      arch_buffs.push_back(inbuffs[j]);
+    }
+    test_data.push_back(arch_buffs);
+  }
+
+  // Same ordering as each arch's buffer list: outputs, then inputs.
+  std::vector<volk_type_t> both_sigs;
+  both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end());
+  both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end());
+
+  //now run the test
+  // The number of buffers and of scalar inputs selects the function-pointer
+  // type that manual_func is cast to.
+  clock_t start, end;
+  for(int i = 0; i < arch_list.size(); i++) {
+    start = clock();
+
+    switch(both_sigs.size()) {
+    case 1:
+      if(inputsc.size() == 0) {
+        run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+      } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+        run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+      } else throw "unsupported 1 arg function >1 scalars";
+      break;
+    case 2:
+      if(inputsc.size() == 0) {
+        run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+      } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+        run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+      } else throw "unsupported 2 arg function >1 scalars";
+      break;
+    case 3:
+      if(inputsc.size() == 0) {
+        run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+      } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+        run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+      } else throw "unsupported 3 arg function >1 scalars";
+      break;
+    case 4:
+      run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+      break;
+    default:
+      throw "no function handler for this signature";
+      break;
+    }
+
+    end = clock();
+    std::cout << arch_list[i] << " completed in " << (double)(end-start)/(double)CLOCKS_PER_SEC << "s" << std::endl;
+  }
+  //and now compare each output to the generic output
+  //first we have to know which output is the generic one, they aren't in order...
+  // Stays 0 if no entry is named "generic".
+  int generic_offset=0;
+  for(int i=0; i<arch_list.size(); i++)
+    if(arch_list[i] == "generic") generic_offset=i;
+
+  //now compare
+  // Every buffer in both_sigs is compared, the shared inputs included.
+  bool fail = false;
+  bool fail_global = false;
+  for(int i=0; i<arch_list.size(); i++) {
+    if(i != generic_offset) {
+      for(int j=0; j<both_sigs.size(); j++) {
+        if(both_sigs[j].is_float) {
+          if(both_sigs[j].size == 8) {
+            fail = fcompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+          } else {
+            fail = fcompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+          }
+        } else {
+          //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ
+          switch(both_sigs[j].size) {
+          case 8:
+            if(both_sigs[j].is_signed) {
+              fail = icompare((int64_t *) test_data[generic_offset][j], (int64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+            } else {
+              fail = icompare((uint64_t *) test_data[generic_offset][j], (uint64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+            }
+            break;
+          case 4:
+            if(both_sigs[j].is_signed) {
+              fail = icompare((int32_t *) test_data[generic_offset][j], (int32_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+            } else {
+              fail = icompare((uint32_t *) test_data[generic_offset][j], (uint32_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+            }
+            break;
+          case 2:
+            if(both_sigs[j].is_signed) {
+              fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+            } else {
+              fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+            }
+            break;
+          case 1:
+            if(both_sigs[j].is_signed) {
+              fail = icompare((int8_t *) test_data[generic_offset][j], (int8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+            } else {
+              fail = icompare((uint8_t *) test_data[generic_offset][j], (uint8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+            }
+            break;
+          default:
+            fail=1;
+          }
+        }
+        if(fail) {
+          fail_global = true;
+          std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
+        }
+      }
+    }
+  }
+
+  return fail_global;
+}
+
+
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
new file mode 100644
index 000000000..1b64bacaa
--- /dev/null
+++ b/volk/lib/qa_utils.h
@@ -0,0 +1,33 @@
+#ifndef VOLK_QA_UTILS_H
+#define VOLK_QA_UTILS_H
+
+#include <cstdlib>
+#include <string>
+
+// Description of one type token of a kernel name, as filled in by
+// volk_type_from_string().
+struct volk_type_t {
+ bool is_float; // set by an 'f' suffix
+ bool is_scalar; // set by a leading 's'
+ bool is_signed; // set by an 'i' suffix, cleared by 'u'
+ bool is_complex; // set by a 'c' suffix
+ int size; // element size in bytes (bit width / 8)
+ std::string str; // the token exactly as it was passed in
+};
+
+// Parses a single type token such as "32fc" or "s32f"; throws on malformed input.
+volk_type_t volk_type_from_string(std::string);
+
+// Returns a pseudo-random float built from rand(), scaled to [-1, 1].
+float uniform(void);
+// NOTE(review): qa_utils.cc defines random_floats as a function template, so
+// this non-template declaration does not appear to have a matching definition
+// there; confirm before calling it from another file.
+void random_floats(float *buf, unsigned n);
+
+// Arguments: arch table, _manual entry point, kernel name, tolerance, scalar,
+// vector length, iterations. Returns true when any architecture fails.
+bool run_volk_tests(const int[], void(*)(), std::string, float, float, int, int);
+
+// Runs run_volk_tests for kernel `func`, using func##_arch_defs and
+// func##_manual, and checks with BOOST_CHECK_EQUAL that the result is 0.
+#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_CHECK_EQUAL(run_volk_tests(func##_arch_defs, (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter), 0)
+
+// Function-pointer shapes that run_volk_tests casts the _manual entry point to.
+// Every shape ends with (length, architecture name); the _s32f variants take a
+// float scalar just before the length.
+typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
+typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*);
+typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*);
+typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
+typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
+
+#endif //VOLK_QA_UTILS_H
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
new file mode 100644
index 000000000..779bc61eb
--- /dev/null
+++ b/volk/lib/testqa.cc
@@ -0,0 +1,100 @@
+#include "qa_utils.h"
+#include <volk/volk.h>
+#include <volk/volk_registry.h>
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_CASE(volk_test_all) {
+ // One line per kernel: VOLK_RUN_TESTS(kernel, tolerance, scalar, vector length, iterations); see qa_utils.h.
+// VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a16, 1e-4, 2046, 10000);
+// VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a16, 1e-4, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 32768.0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a16, 1e-5, 32768.0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 0, 0, 2046, 10000); // NOTE(review): repeats the earlier run of this kernel, now with zero tolerance
+ VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 32768.0, 2046, 10000); // NOTE(review): identical to the earlier run of this kernel
+ VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 0, 0, 2046, 10000); // NOTE(review): repeats the earlier run of this kernel, now with zero tolerance
+ VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a16, 1, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a16, 1e-5, 32768.0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a16, 1e-4, 32768.0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16i_convert_8i_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16i_max_star_16i_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a16, 0, 0, 2046, 10000);
+// VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a16, 1e-4, 0, 2046, 10000);
+// VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_16u_byteswap_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_x2_add_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a16, 1e-4, 0, 2046, 1000);
+ VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a16, 1e-4, 20.0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a16, 1e-4, 10.0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a16, 0, 32768, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_index_max_16u_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 1, 32768, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a16, 1, 32768, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a16, 1, 2147483648.0, 2046, 10000); // 2^31; was 2<<31, which overflows int before conversion to float
+ VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2147483648.0, 2046, 10000); // 2^31; was 2<<31, which overflows int before conversion to float
+ VOLK_RUN_TESTS(volk_32f_convert_64f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 1, 128, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 2046, 10000);
+// VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a16, 1e-4, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16, 1e-4, 10, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 2046, 10000);
+// VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a16, 1e-4, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 1, 32768, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_x2_max_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_x2_min_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_s32f_normalize_a16, 1e-4, 100, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a16, 1e-4, 4, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_sqrt_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a16, 1e-4, 100, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32i_x2_and_32i_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a16, 1e-4, 100, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32i_x2_or_32i_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_32u_byteswap_a16, 0, 0, 2046, 10000);
+// VOLK_RUN_TESTS(volk_32u_popcnt_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_64f_convert_32f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_64f_x2_max_64f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_64f_x2_min_64f_a16, 1e-4, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_64u_byteswap_a16, 0, 0, 2046, 10000);
+// VOLK_RUN_TESTS(volk_64u_popcnt_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a16, 1e-4, 100, 2046, 10000);
+ VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a16, 0, 256, 2046, 10000);
+ VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a16, 1e-4, 100, 2046, 10000);
+ VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a16, 1e-4, 100, 2046, 10000);
+ VOLK_RUN_TESTS(volk_8i_convert_16i_a16, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 2046, 10000);
+ VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a16, 1e-4, 100, 2046, 10000);
+ VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 2046, 10000);
+
+}
diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c
new file mode 100644
index 000000000..b1a93db26
--- /dev/null
+++ b/volk/lib/volk_rank_archs.c
@@ -0,0 +1,28 @@
+#include<volk_rank_archs.h>
+#include<stdio.h>
+
+/*
+ * Choose the best implementation that can run on this machine.
+ *
+ * arch_defs[0] is the number of entries and arch_defs[1..arch_defs[0]] hold
+ * one feature bit mask per entry; arch is the mask of available features.
+ * An entry is usable only if every bit of its mask is also set in arch, and
+ * among usable entries the numerically largest mask wins. Entry 1 is never
+ * tested against arch: it is the fallback (presumably the generic
+ * implementation -- confirm against the generator of the arch_defs tables).
+ *
+ * Returns the zero-based index of the chosen entry, so arch_defs[ret + 1]
+ * is the winning mask.
+ */
+unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch) {
+  int i = 2;
+  unsigned int best_val = 0;
+  for(; i < arch_defs[0] + 1; ++i) {
+    /* ~arch (bitwise), not !arch (logical): !arch is 0 for any non-zero
+     * arch, which made every entry look usable. */
+    if((arch_defs[i] & ~arch) == 0) {
+      best_val = (arch_defs[i] > arch_defs[best_val + 1]) ? i-1 : best_val;
+    }
+  }
+  return best_val;
+}
diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h
new file mode 100644
index 000000000..26b9f7503
--- /dev/null
+++ b/volk/lib/volk_rank_archs.h
@@ -0,0 +1,14 @@
+#ifndef INCLUDED_VOLK_RANK_ARCHS_H
+#define INCLUDED_VOLK_RANK_ARCHS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Selects one entry of arch_defs (arch_defs[0] is the entry count) for the bit mask arch and returns its zero-based index. */
+unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/