30 files changed, 2068 insertions, 0 deletions
diff --git a/volk/lib/.gitignore b/volk/lib/.gitignore
new file mode 100644
index 000000000..6a5fde28f
--- /dev/null
+++ b/volk/lib/.gitignore
@@ -0,0 +1,23 @@
+/*.cache
+/*.la
+/*.lo
+/*.pc
+/.deps
+/.la
+/.libs
+/.lo
+/Makefile
+/Makefile.in
+/volk.c
+/volk_cpu_generic.c
+/volk_cpu_powerpc.c
+/volk_cpu_x86.c
+/volk_environment_init.c
+/volk_init.c
+/volk_init.h
+/volk_mktables
+/volk_mktables.c
+/volk_proccpu_sim.c
+/volk_runtime.c
+/test_all
+/testqa
diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am
new file mode 100644
index 000000000..3e5502369
--- /dev/null
+++ b/volk/lib/Makefile.am
@@ -0,0 +1,157 @@
+#
+# Copyright 2010,2011 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+include $(top_srcdir)/Makefile.common
+
+# FIXME: forcing $(top_builddir)/include onto the include path for
+# distcheck seems like a bit of a hack.  Figure out the right way to find
+# the built volk_config.h and volk_tables.h.
+
+AM_CPPFLAGS = \
+	$(STD_DEFINES_AND_INCLUDES) -I$(top_builddir)/include \
+	$(LV_CXXFLAGS) $(WITH_INCLUDES)
+
+
+# We build 2 libraries and 1 executable here.  The libraries contain
+# everything except the QA code. The C++ QA code is especially recommended
+# when you have general purpose C or C++ code that may not get
+# thoroughly exercised by building and running a GR block.  The
+# executable runs the QA code; it is not yet hooked into "make check".
+#
+#
+#
+# N.B., If there's a SWIG generated shared library and associated
+# python code, it will be contained in ../python, not here.  (That
+# code is conditionally built depending on the state of the
+# --without-python configure option.)  However, the .i should be here
+# next to the .h that it's based on.
+
+
+# list of programs run by "make check" and "make distcheck"
+#TESTS = testqa
+# The ORC code is built in the ../orc directory only when ORC is enabled.
+# It gets linked in during the build of libvolk as an added library.
+# There might be a better way to do this.
+
+# Installed libtool libraries.
+lib_LTLIBRARIES = libvolk.la libvolk_runtime.la
+
+# Extra files shipped in the tarball; none of them appears in a _SOURCES list below.
+EXTRA_DIST = \
+	volk_mktables.c \
+	volk_rank_archs.h \
+	volk_proccpu_sim.c \
+	gcc_x86_cpuid.h
+
+# ----------------------------------------------------------------
+#                      The main library
+# ----------------------------------------------------------------
+
+libvolk_runtime_la_SOURCES = \
+	$(platform_CODE) \
+	volk_runtime.c \
+	volk_init.c \
+	volk_rank_archs.c
+
+libvolk_la_SOURCES = \
+	$(platform_CODE) \
+	volk.c \
+	volk_environment_init.c
+
+volk_orc_LDFLAGS = $(ORC_LDFLAGS)
+
+# -lorc-0.4 is a library, so it is passed through LIBADD: libtool places
+# LDFLAGS before the objects, where --as-needed linkers discard libraries.
+volk_orc_LIBADD = ../orc/libvolk_orc.la -lorc-0.4
+
+if LV_HAVE_ORC
+libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS)
+libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS)
+libvolk_la_LIBADD = $(volk_orc_LIBADD)
+libvolk_runtime_la_LIBADD = -lorc-0.4
+else
+libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
+libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
+libvolk_la_LIBADD =
+endif
+
+
+# ----------------------------------------------------------------
+#        The QA library.  Note libvolk.la in LIBADD
+# ----------------------------------------------------------------
+#libvolk_qa_la_SOURCES = \
+#	qa_utils.cc
+
+#libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lboost
+
+#libvolk_qa_la_LIBADD = \
+#	libvolk.la \
+#	libvolk_runtime.la
+
+# ----------------------------------------------------------------
+# Headers that are part of the build but never installed
+# ----------------------------------------------------------------
+# (The noinst_ prefix keeps these files out of "make install".)
+noinst_HEADERS = \
+	volk_init.h qa_utils.h
+
+# ----------------------------------------------------------------
+# Our test program
+# ----------------------------------------------------------------
+noinst_PROGRAMS = testqa
+
+testqa_SOURCES = testqa.cc qa_utils.cc
+testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN $(AM_CPPFLAGS)
+
+# The Boost unit-test library goes in LDADD rather than LDFLAGS so that it
+# follows the objects on the link line; listed before them, linkers using
+# --as-needed drop it and the link fails with undefined references.
+testqa_LDADD = \
+	libvolk.la \
+	libvolk_runtime.la \
+	$(BOOST_UNIT_TEST_FRAMEWORK_LIB)
+
+if LV_HAVE_ORC
+testqa_LDADD += ../orc/libvolk_orc.la
+endif
+
+# Extra cleanup hooked into "make distclean": delete the files named below
+# from the build directory (rm -f, so files that are absent are not an error).
+distclean-local:
+	rm -f \
+		volk.c \
+		volk_cpu_generic.c volk_cpu_powerpc.c volk_cpu_x86.c \
+		volk_init.c volk_init.h \
+		volk_mktables.c \
+		volk_proccpu_sim.c \
+		volk_runtime.c \
+		volk_tables.h \
+		volk_environment_init.c
+#SUBDIRS = 
+
+#ifdef BUILD_SSE
+#SUBDIRS += sse
+#elif BUILD_SPU
+#SUBDIRS += spu
+#else
+#SUBDIRS += port
+#endif
+
+
diff --git a/volk/lib/gcc_x86_cpuid.h b/volk/lib/gcc_x86_cpuid.h
new file mode 100644
index 000000000..2d0916fb3
--- /dev/null
+++ b/volk/lib/gcc_x86_cpuid.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ * 
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ * 
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+/* Feature bits reported in %ecx.  */
+#define bit_SSE3	(1 << 0)
+#define bit_PCLMUL	(1 << 1)
+#define bit_SSSE3	(1 << 9)
+#define bit_FMA		(1 << 12)
+#define bit_CMPXCHG16B	(1 << 13)
+#define bit_SSE4_1	(1 << 19)
+#define bit_SSE4_2	(1 << 20)
+#define bit_MOVBE	(1 << 22)
+#define bit_POPCNT	(1 << 23)
+#define bit_AES		(1 << 25)
+#define bit_XSAVE	(1 << 26)
+#define bit_OSXSAVE	(1 << 27)
+#define bit_AVX		(1 << 28)
+
+/* Feature bits reported in %edx.  */
+#define bit_CMPXCHG8B	(1 << 8)
+#define bit_CMOV	(1 << 15)
+#define bit_MMX		(1 << 23)
+#define bit_FXSAVE	(1 << 24)
+#define bit_SSE		(1 << 25)
+#define bit_SSE2	(1 << 26)
+
+/* Extended Features */
+/* Extended-feature bits reported in %ecx.  */
+#define bit_LAHF_LM	(1 << 0)
+#define bit_SSE4a	(1 << 6)
+#define bit_SSE5	(1 << 11)
+
+/* Extended-feature bits reported in %edx.  */
+#define bit_LM		(1 << 29)
+#define bit_3DNOWP	(1 << 30)
+#define bit_3DNOW	(1 << 31)
+
+
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx may be the PIC register, so it is exchanged with operand 1 around cpuid instead of being an output.  */
+#if __GNUC__ >= 3
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("xchg{l}\t{%%}ebx, %1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchg{l}\t{%%}ebx, %1\n\t"			\
+	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+/* As __cpuid, with COUNT additionally tied to the %ecx operand as an input.  */
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("xchg{l}\t{%%}ebx, %1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchg{l}\t{%%}ebx, %1\n\t"			\
+	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#else
+/* Host GCCs older than 3.0 did not support Intel asm syntax
+   nor alternatives in i386 code, so plain AT&T mnemonics are used.  */
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("xchgl\t%%ebx, %1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchgl\t%%ebx, %1\n\t"			\
+	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+/* As __cpuid, with COUNT additionally tied to the %ecx operand as an input.  */
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("xchgl\t%%ebx, %1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchgl\t%%ebx, %1\n\t"			\
+	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#endif
+#else /* not (__i386__ && __PIC__): %ebx is used directly as an output */
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("cpuid\n\t"					\
+	   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+/* As __cpuid, with COUNT additionally tied to the %ecx operand as an input.  */
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("cpuid\n\t"					\
+	   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#endif /* __i386__ && __PIC__ */
+
+/* Return highest supported input value for cpuid instruction.  ext can
+   be either 0x0 or 0x80000000 to return highest supported value for
+   basic or extended cpuid information.  Function returns 0 if cpuid
+   is not supported or whatever cpuid returns in eax register.  If sig
+   pointer is non-null, then first four bytes of the signature
+   (as found in ebx register) are returned in location pointed by sig.  */
+
+static __inline unsigned int
+__get_cpuid_max (unsigned int __ext, unsigned int *__sig)
+{
+  unsigned int __eax, __ebx, __ecx, __edx;
+
+#ifndef __x86_64__
+#if __GNUC__ >= 3
+  /* See if we can use cpuid.  On AMD64 we always can.  The asm saves the flags, tries to flip bit 0x00200000, reads the flags back and restores them.  */
+  __asm__ ("pushf{l|d}\n\t"
+	   "pushf{l|d}\n\t"
+	   "pop{l}\t%0\n\t"
+	   "mov{l}\t{%0, %1|%1, %0}\n\t"
+	   "xor{l}\t{%2, %0|%0, %2}\n\t"
+	   "push{l}\t%0\n\t"
+	   "popf{l|d}\n\t"
+	   "pushf{l|d}\n\t"
+	   "pop{l}\t%0\n\t"
+	   "popf{l|d}\n\t"
+	   : "=&r" (__eax), "=&r" (__ebx)
+	   : "i" (0x00200000));
+#else
+/* Host GCCs older than 3.0 did not support Intel asm syntax
+   nor alternatives in i386 code; same sequence in plain AT&T mnemonics.  */
+  __asm__ ("pushfl\n\t"
+	   "pushfl\n\t"
+	   "popl\t%0\n\t"
+	   "movl\t%0, %1\n\t"
+	   "xorl\t%2, %0\n\t"
+	   "pushl\t%0\n\t"
+	   "popfl\n\t"
+	   "pushfl\n\t"
+	   "popl\t%0\n\t"
+	   "popfl\n\t"
+	   : "=&r" (__eax), "=&r" (__ebx)
+	   : "i" (0x00200000));
+#endif
+  /* __ebx holds the original flags, __eax the flags read back after the flip attempt; an unchanged bit is reported as 0.  */
+  if (!((__eax ^ __ebx) & 0x00200000))
+    return 0;
+#endif
+
+  /* Host supports cpuid.  Return highest supported cpuid input value.  */
+  __cpuid (__ext, __eax, __ebx, __ecx, __edx);
+  /* The %ebx result is handed back only when the caller supplied a place for it.  */
+  if (__sig)
+    *__sig = __ebx;
+
+  return __eax;
+}
+
+/* Return cpuid data for requested cpuid level, as found in returned
+   eax, ebx, ecx and edx registers.  The function checks if cpuid is
+   supported and returns 1 for valid cpuid information or 0 for
+   unsupported cpuid level.  All pointers are required to be non-null.  */
+
+static __inline int
+__get_cpuid (unsigned int __level,
+	     unsigned int *__eax, unsigned int *__ebx,
+	     unsigned int *__ecx, unsigned int *__edx)
+{
+  unsigned int __ext = __level & 0x80000000;
+  unsigned int __maxlevel = __get_cpuid_max (__ext, 0);
+  /* A maximum of 0 means cpuid is unsupported: never execute it, even for __level 0.  */
+  if (__maxlevel == 0 || __maxlevel < __level)
+    return 0;
+  __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx);
+  return 1;
+}
diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc
new file mode 100644
index 000000000..154aa0f17
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.cc
@@ -0,0 +1,89 @@
+#include <volk/volk.h>
+#include <qa_16s_add_quad_aligned16.h>
+#include <volk/volk_16s_add_quad_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+// Without SSE2 support compiled in there is nothing to compare; just say so.
+void qa_16s_add_quad_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+
+// Times the "generic" and "sse2" variants of volk_16s_add_quad_aligned16 on
+// identical random inputs (ITERS calls each, elapsed CPU time is printed),
+// then requires the four output vectors of the two runs to match element
+// for element.
+void qa_16s_add_quad_aligned16::t1() {
+  volk_environment_init();
+
+  const int vlen = 3200;
+  const int ITERS = 100000;
+
+  // The five input vectors.
+  short input0[vlen] __attribute__ ((aligned (16)));
+  short input1[vlen] __attribute__ ((aligned (16)));
+  short input2[vlen] __attribute__ ((aligned (16)));
+  short input3[vlen] __attribute__ ((aligned (16)));
+  short input4[vlen] __attribute__ ((aligned (16)));
+
+  // Outputs of the generic run ...
+  short output0[vlen] __attribute__ ((aligned (16)));
+  short output1[vlen] __attribute__ ((aligned (16)));
+  short output2[vlen] __attribute__ ((aligned (16)));
+  short output3[vlen] __attribute__ ((aligned (16)));
+  // ... and of the sse2 run.
+  short output01[vlen] __attribute__ ((aligned (16)));
+  short output11[vlen] __attribute__ ((aligned (16)));
+  short output21[vlen] __attribute__ ((aligned (16)));
+  short output31[vlen] __attribute__ ((aligned (16)));
+
+  // Every input sample is plus - minus, each a random value shifted right by
+  // two.  rand() is called in the order plus, minus for input0, input1, ...,
+  // input4, sample by sample.
+  short* const inputs[5] = {input0, input1, input2, input3, input4};
+  for(int sample = 0; sample < vlen; ++sample) {
+    for(int which = 0; which < 5; ++which) {
+      short plus = ((short) (rand() - (RAND_MAX/2))) >> 2;
+      short minus = ((short) (rand() - (RAND_MAX/2))) >> 2;
+      inputs[which][sample] = plus - minus;
+    }
+  }
+
+  printf("16s_add_quad_aligned\n");
+
+  // Time the generic variant.
+  int count = 0;
+  clock_t start = clock();
+  while(count < ITERS) {
+    volk_16s_add_quad_aligned16_manual(output0, output1, output2, output3, input0, input1, input2, input3, input4, vlen << 1 , "generic");
+    ++count;
+  }
+  clock_t end = clock();
+  double total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  // Time the sse2 variant on the same inputs.
+  start = clock();
+  for(count = 0; count < ITERS; ++count) {
+    volk_16s_add_quad_aligned16_manual(output01, output11, output21, output31, input0, input1, input2, input3, input4, vlen << 1 , "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  // Both variants must agree on every element of every output vector.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output1[i], output11[i]);
+    CPPUNIT_ASSERT_EQUAL(output2[i], output21[i]);
+    CPPUNIT_ASSERT_EQUAL(output3[i], output31[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_add_quad_aligned16.h b/volk/lib/qa_16s_add_quad_aligned16.h
new file mode 100644
index 000000000..3c1ae978b
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
+#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_add_quad_aligned16 : public CppUnit::TestCase {
+  // CppUnit fixture; the suite declared below contains the single test t1.
+  CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();  // defined in qa_16s_add_quad_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
new file mode 100644
index 000000000..62deffaeb
--- /dev/null
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
@@ -0,0 +1,106 @@
+#include <volk/volk.h>
+#include <qa_16s_branch_4_state_8_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+// Built without SSSE3: nothing to compare, so only report it.
+void qa_16s_branch_4_state_8_aligned16::t1() {
+  printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+// Runs the "sse2" permute_and_scalar_add kernel as the reference, then the
+// "ssse3" and "generic" branch_4_state_8 kernels, timing num_iters calls of
+// each, prints the reference next to the generic result, and requires all
+// three result vectors to be equal.
+void qa_16s_branch_4_state_8_aligned16::t1() {
+  const int num_iters = 1000000;
+  const int vlen = 32;
+
+  // Four 16-byte tables; the branch_4_state_8 kernels receive them via permuters.
+  static char permute0[16]__attribute__((aligned(16))) = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03};
+  static char permute1[16]__attribute__((aligned(16))) = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01};
+  static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f};
+  static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d};
+  static char* permuters[4] = {permute0, permute1, permute2, permute3};
+
+  unsigned int num_bytes = vlen << 1;
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+
+  // target receives the permute_and_scalar_add result; target2 and target3
+  // receive the branch_4_state_8 results ("ssse3" and "generic").
+  short target[vlen] __attribute__ ((aligned (16)));
+  short target2[vlen] __attribute__ ((aligned (16)));
+  short target3[vlen] __attribute__ ((aligned (16)));
+
+  // Inputs: source vector, permutation indexes, control masks and scalars.
+  short src0[vlen] __attribute__ ((aligned (16)));
+  short permute_indexes[vlen] __attribute__ ((aligned (16))) =  {
+7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 };
+  short cntl0[vlen] __attribute__ ((aligned (16))) = {
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
+  short cntl1[vlen] __attribute__ ((aligned (16))) = {
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
+  short cntl2[vlen] __attribute__ ((aligned (16))) = {
+    0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 };
+  short cntl3[vlen] __attribute__ ((aligned (16))) =  {
+    0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff };
+  short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4};
+
+  // The source vector is simply 0 .. vlen-1.
+  for(int i = 0; i < vlen; ++i) {
+    src0[i] = i;
+  }
+
+  printf("16s_branch_4_state_8_aligned\n");
+
+  // Reference: permute_and_scalar_add, "sse2" variant.
+  start = clock();
+  for(int i = 0; i < num_iters; ++i) {
+    volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("permute_and_scalar_add_time: %f\n", total);
+
+  // branch_4_state_8, "ssse3" variant.
+  start = clock();
+  for(int i = 0; i < num_iters; ++i) {
+    volk_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("branch_4_state_8_time, ssse3: %f\n", total);
+
+  // branch_4_state_8, "generic" variant.  The label names the kernel that is
+  // actually timed here, matching the ssse3 line above.
+  start = clock();
+  for(int i = 0; i < num_iters; ++i) {
+    volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("branch_4_state_8_time, generic: %f\n", total);
+
+  // Show the reference next to the generic branch_4_state_8 result.
+  for(int i = 0; i < vlen; ++i) {
+    printf("psa... %d, b4s8... %d\n", target[i], target3[i]);
+  }
+
+  // Both branch_4_state_8 variants must reproduce the reference exactly.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT(target[i] == target2[i]);
+    CPPUNIT_ASSERT(target[i] == target3[i]);
+  }
+}
+
+
+#endif
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.h b/volk/lib/qa_16s_branch_4_state_8_aligned16.h
new file mode 100644
index 000000000..41ab073e0
--- /dev/null
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
+#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase {
+  // CppUnit fixture; the suite declared below contains the single test t1.
+  CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();  // defined in qa_16s_branch_4_state_8_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
new file mode 100644
index 000000000..819b2256b
--- /dev/null
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
@@ -0,0 +1,78 @@
+#include <volk/volk.h>
+#include <qa_16s_permute_and_scalar_add_aligned16.h>
+#include <volk/volk_16s_permute_and_scalar_add_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+// No SSE2 kernel in this build, hence nothing to compare against.
+void qa_16s_permute_and_scalar_add_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+// Runs the "generic" and the "sse2" variant of
+// volk_16s_permute_and_scalar_add_aligned16 on the same data, prints the CPU
+// time each one needed, and requires both result vectors to be equal.
+void qa_16s_permute_and_scalar_add_aligned16::t1() {
+  const int vlen = 64;
+  const int iterations = 100000;
+  unsigned int num_bytes = vlen << 1;
+
+  volk_environment_init();
+
+  // One result vector per variant.
+  short target[vlen] __attribute__ ((aligned (16)));
+  short target2[vlen] __attribute__ ((aligned (16)));
+
+  // Inputs shared by both runs.
+  short src0[vlen] __attribute__ ((aligned (16)));
+  short permute_indexes[vlen] __attribute__ ((aligned (16)));
+  short cntl0[vlen] __attribute__ ((aligned (16)));
+  short cntl1[vlen] __attribute__ ((aligned (16)));
+  short cntl2[vlen] __attribute__ ((aligned (16)));
+  short cntl3[vlen] __attribute__ ((aligned (16)));
+  short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4};
+
+  // src0 counts up from 0, the permutation is 3*k mod vlen, and the four
+  // control vectors hold either 0xff or 0 following the patterns below.
+  for(int k = 0; k < vlen; ++k) {
+    src0[k] = k;
+    permute_indexes[k] = (3 * k)%vlen;
+    cntl0[k] = 0xff;
+    cntl1[k] = (k%2) ? 0xff : 0;
+    cntl2[k] = ((k>>1)%2) ? 0xff : 0;
+    cntl3[k] = ((k%4) == 3) ? 0xff : 0;
+  }
+
+  printf("16s_permute_and_scalar_add_aligned\n");
+
+  // Time the generic variant.
+  clock_t start = clock();
+  for(int run = 0; run < iterations; ++run) {
+    volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic");
+  }
+  clock_t end = clock();
+  double total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  // Time the sse2 variant on the same inputs.
+  start = clock();
+  for(int run = 0; run < iterations; ++run) {
+    volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  // The two variants must agree on every element.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT(target[i] == target2[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h
new file mode 100644
index 000000000..3643aeef6
--- /dev/null
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase {
+  // CppUnit fixture; the suite declared below contains the single test t1.
+  CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();  // defined in qa_16s_permute_and_scalar_add_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc
new file mode 100644
index 000000000..66f8c9afa
--- /dev/null
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16s_quad_max_star_aligned16.h>
+#include <volk/volk_16s_quad_max_star_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+// No SSE2 kernel in this build, hence nothing to compare against.
+void qa_16s_quad_max_star_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+// Feeds the same four random input vectors to the "generic" and "sse2"
+// variants of volk_16s_quad_max_star_aligned16, prints and compares the outputs.
+void qa_16s_quad_max_star_aligned16::t1() {
+  const int vlen = 34;
+
+  short input0[vlen] __attribute__ ((aligned (16)));
+  short input1[vlen] __attribute__ ((aligned (16)));
+  short input2[vlen] __attribute__ ((aligned (16)));
+  short input3[vlen] __attribute__ ((aligned (16)));
+
+  // One output vector per variant: output0 is generic, output1 is sse2.
+  short output0[vlen] __attribute__ ((aligned (16)));
+  short output1[vlen] __attribute__ ((aligned (16)));
+
+  // Per sample, the four "plus" values are drawn first, then the four "minus"
+  // values; each input element is the corresponding difference.
+  for(int n = 0; n < vlen; ++n) {
+    short plus[4];
+    short minus[4];
+    for(int k = 0; k < 4; ++k) plus[k] = (short) (rand() - (RAND_MAX/2));
+    for(int k = 0; k < 4; ++k) minus[k] = (short) (rand() - (RAND_MAX/2));
+
+    input0[n] = plus[0] - minus[0];
+    input1[n] = plus[1] - minus[1];
+    input2[n] = plus[2] - minus[2];
+    input3[n] = plus[3] - minus[3];
+  }
+
+  volk_16s_quad_max_star_aligned16_manual(output0, input0, input1, input2, input3, 2*vlen, "generic");
+  volk_16s_quad_max_star_aligned16_manual(output1, input0, input1, input2, input3, 2*vlen, "sse2");
+
+  printf("16s_quad_max_star_aligned\n");
+  for(int n = 0; n < vlen; ++n) {
+    printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", output0[n], output1[n], input0[n], input1[n], input2[n], input3[n]);
+  }
+
+  // Only after everything has been printed are the two outputs compared.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.h b/volk/lib/qa_16s_quad_max_star_aligned16.h
new file mode 100644
index 000000000..51e77081a
--- /dev/null
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
+#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase {
+  // CppUnit fixture; the suite declared below contains the single test t1.
+  CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();  // defined in qa_16s_quad_max_star_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc
new file mode 100644
index 000000000..592304f83
--- /dev/null
+++ b/volk/lib/qa_32f_fm_detect_aligned16.cc
@@ -0,0 +1,61 @@
+#include <volk/volk.h>
+#include <qa_32f_fm_detect_aligned16.h>
+#include <volk/volk_32f_fm_detect_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+// No SSE kernel in this build, hence nothing to compare against.
+void qa_32f_fm_detect_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+// Times the "generic" and "sse" variants of volk_32f_fm_detect_aligned16 on
+// one random input and requires their outputs to agree within a relative 1e-4.
+void qa_32f_fm_detect_aligned16::t1() {
+  volk_environment_init();
+  const int vlen = 3201;
+  const int ITERS = 10000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  // Random input: (rand() - RAND_MAX/2) divided by RAND_MAX/2.
+  const float half_range = static_cast<float>((RAND_MAX/2));
+  for(int n = 0; n < vlen; ++n) {
+    input0[n] = ((float) (rand() - (RAND_MAX/2))) / half_range;
+  }
+  printf("32f_fm_detect_aligned\n");
+
+  // Generic variant; save starts at 0.1 and is passed by address to every call.
+  clock_t start = clock();
+  float save = 0.1;
+  for(int run = 0; run < ITERS; ++run) {
+    volk_32f_fm_detect_aligned16_manual(output0, input0, 1.0, &save, vlen, "generic");
+  }
+  clock_t end = clock();
+  double total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  // SSE variant, restarted from the same initial save value.
+  start = clock();
+  save = 0.1;
+  for(int run = 0; run < ITERS; ++run) {
+    volk_32f_fm_detect_aligned16_manual(output01, input0, 1.0, &save, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  // The allowed difference scales with the magnitude of the generic value.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.h b/volk/lib/qa_32f_fm_detect_aligned16.h
new file mode 100644
index 000000000..a2680c524
--- /dev/null
+++ b/volk/lib/qa_32f_fm_detect_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
+#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase {
+  // CppUnit fixture; the suite declared below contains the single test t1.
+  CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();  // defined in qa_32f_fm_detect_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc
new file mode 100644
index 000000000..a1c3d4cd1
--- /dev/null
+++ b/volk/lib/qa_32f_index_max_aligned16.cc
@@ -0,0 +1,103 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_index_max_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 1000000
+#define VEC_LEN 3097
+static float uniform() {
+  return ((float) rand() / RAND_MAX - 0.5) * 2.0;	// one rand() draw scaled onto [-1, 1]
+}
+
+// Fills buf[0 .. n) with successive uniform() draws, each scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  float *const stop = buf + n;
+  float *cursor = buf;
+  while (cursor != stop) {
+    *cursor++ = uniform () * 32767;
+  }
+}
+
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_index_max_aligned16::t1(){
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+// Runs volk_32f_index_max_aligned16 three ways on the same random vector --
+// the "generic" and "sse2" variants by name and whatever get_volk_runtime()
+// dispatches to -- prints the CPU time of each, and requires all three to
+// report the same result.
+void qa_32f_index_max_aligned16::t1(){
+  const int vlen = VEC_LEN;
+
+  volk_runtime_init();
+  volk_environment_init();
+
+  // Every run stores its single unsigned result in its own word.
+  unsigned int i_target_generic = 0;
+  unsigned int i_target_sse = 0;
+  unsigned int i_target_sse4_1 = 0;
+  unsigned int* target_generic = &i_target_generic;
+  unsigned int* target_sse = &i_target_sse;
+  unsigned int* target_sse4_1 = &i_target_sse4_1;
+
+  // posix_memalign reports failure through its return value and then leaves
+  // src0 unusable, so stop here rather than hand the kernels a bad buffer.
+  float* src0 = NULL;
+  int ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float));
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+
+  random_floats((float*)src0, vlen);
+
+  printf("32f_index_max_aligned16\n");
+
+  clock_t start, end;
+  double total;
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  // NOTE(review): the label below says "sse" while the variant requested is
+  // "sse2" -- confirm which one is intended.
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse time: %f\n", total);
+
+  // NOTE(review): labelled "sse4.1", yet the call goes through the runtime
+  // dispatcher -- confirm which implementation it selects.
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    get_volk_runtime()->volk_32f_index_max_aligned16(target_sse4_1, src0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.1 time: %f\n", total);
+
+  printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]);
+
+  // The asserts only read the result words, so the input is released first;
+  // a failing assert throws and would otherwise leak it.
+  free(src0);
+  CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]);
+  CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32f_index_max_aligned16.h b/volk/lib/qa_32f_index_max_aligned16.h
new file mode 100644
index 000000000..8cadffa47
--- /dev/null
+++ b/volk/lib/qa_32f_index_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_index_max_aligned16 : public CppUnit::TestCase {
+  // CppUnit fixture; the suite declared below contains the single test t1.
+  CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();  // defined in qa_32f_index_max_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc
new file mode 100644
index 000000000..4d83f1639
--- /dev/null
+++ b/volk/lib/qa_32fc_index_max_aligned16.cc
@@ -0,0 +1,89 @@
+#include <volk/volk.h>
+#include <qa_32fc_index_max_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 1000000
+#define VEC_LEN 3096
+static float uniform() {	// one rand() draw mapped linearly onto [-1, 1]
+  return ((float) rand() / RAND_MAX - 0.5) * 2.0;
+}
+
+// Writes n samples into buf, each one uniform() scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  float *const stop = buf + n;
+
+  for (float *cur = buf; cur != stop; ++cur) {
+    *cur = uniform () * 32767;
+  }
+}
+
+
+#ifndef LV_HAVE_SSE3
+
+void qa_32fc_index_max_aligned16::t1(){	// fallback body, compiled when LV_HAVE_SSE3 is not defined
+  printf("sse3 not available... no test performed\n");	// nothing is asserted on this path
+}
+
+#else
+
+
+void qa_32fc_index_max_aligned16::t1(){
+  // Times the "generic" and "sse3" variants of volk_32fc_index_max_aligned16 on one random
+  // buffer and asserts that the two reported indices differ by no more than 1.1.
+  const int vlen = VEC_LEN;
+
+  volk_environment_init();
+
+  // Each variant stores its answer through a pointer to a single unsigned int.
+  unsigned int i_target = 0;
+  unsigned int* target = &i_target;
+  unsigned int i_target_generic = 0;
+  unsigned int* target_generic = &i_target_generic;
+
+  // vlen << 3 bytes hold vlen complex samples (two 4-byte floats each), 16-byte aligned.
+  std::complex<float>* src0 = NULL;
+  const int ret = posix_memalign((void**)&src0, 16, vlen << 3);
+  // posix_memalign reports failure only through a non-zero return value; without this
+  // check a failed allocation would be written and read through an invalid pointer.
+  CPPUNIT_ASSERT_MESSAGE("posix_memalign failed", ret == 0);
+
+  random_floats((float*)src0, vlen * 2);
+
+  printf("32fc_index_max_aligned16\n");
+
+  clock_t start, end;
+  double total;
+
+  // NOTE(review): the kernels are handed vlen << 3 (the buffer's byte size) as their length
+  // argument -- confirm that they count bytes rather than samples.
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3 time: %f\n", total);
+
+  // Free the buffer before asserting: a failing CppUnit assertion exits this function
+  // at once, so cleanup placed after it would be skipped and src0 leaked.
+  free(src0);
+
+  // Report both indices, then compare them.
+  printf("generic: %u, sse3: %u\n", target_generic[0], target[0]);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1);
+}
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32fc_index_max_aligned16.h b/volk/lib/qa_32fc_index_max_aligned16.h
new file mode 100644
index 000000000..0990bcb1f
--- /dev/null
+++ b/volk/lib/qa_32fc_index_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_index_max_aligned16 : public CppUnit::TestCase {
+  // CppUnit fixture for 32fc_index_max_aligned16; the suite declared below contains the single test t1.
+  CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();  // defined in qa_32fc_index_max_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
new file mode 100644
index 000000000..a3d0955bd
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
@@ -0,0 +1,64 @@
+#include <volk/volk.h>
+#include <qa_32fc_power_spectral_density_32f_aligned16.h>
+#include <volk/volk_32fc_power_spectral_density_32f_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse3
+
+#ifndef LV_HAVE_SSE3
+
+void qa_32fc_power_spectral_density_32f_aligned16::t1() {	// fallback body, compiled when LV_HAVE_SSE3 is not defined
+  printf("sse3 not available... no test performed\n");	// nothing is asserted on this path
+}
+
+#else
+
+void qa_32fc_power_spectral_density_32f_aligned16::t1() {
+  // Runs the "generic" and "sse3" variants of the power-spectral-density kernel on one
+  // random complex vector, prints their timings and checks the outputs element by element.
+  volk_environment_init();
+
+  const int vlen = 3201;
+  const int ITERS = 10000;
+  const float scalar = vlen;
+  const float rbw = 1.7;
+
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse3[vlen] __attribute__ ((aligned (16)));
+
+  // Treat the complex input as 2*vlen interleaved floats and fill it from rand().
+  float* const samples = (float*)input0;
+  for(int idx = 0; idx != 2*vlen; ++idx) {
+    const float centred = (float) (rand() - (RAND_MAX/2));
+    samples[idx] = centred / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32fc_power_spectral_density_32f_aligned\n");
+
+  // Reference implementation.
+  clock_t tick = clock();
+  for(int rep = 0; rep < ITERS; ++rep) {
+    volk_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic");
+  }
+  clock_t tock = clock();
+  double elapsed = (double)(tock-tick)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", elapsed);
+
+  // SSE3 implementation, timed the same way.
+  tick = clock();
+  for(int rep = 0; rep < ITERS; ++rep) {
+    volk_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3");
+  }
+  tock = clock();
+  elapsed = (double)(tock-tick)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", elapsed);
+
+  // Each sse3 output must match the generic one within 1e-4 of the generic magnitude.
+  for(int idx = 0; idx < vlen; ++idx) {
+    const float expected = output_generic[idx];
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(expected, output_sse3[idx], fabs(expected*1e-4));
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h
new file mode 100644
index 000000000..26f430bec
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase {
+  // CppUnit fixture for 32fc_power_spectral_density_32f_aligned16; the suite below contains the single test t1.
+  CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();  // defined in qa_32fc_power_spectral_density_32f_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc
new file mode 100644
index 000000000..fefdf06ee
--- /dev/null
+++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc
@@ -0,0 +1,138 @@
+#include <volk/volk.h>
+#include <qa_32fc_x2_conjugate_dot_prod_32fc_u.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+
+#define assertcomplexEqual(expected, actual, delta) do { /* one statement: real and imaginary parts must each agree within |expected part| * delta */ \
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * (delta)); \
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected)) * (delta)); } while (0)
+
+#define	ERR_DELTA	(1e-4)
+
+//test for sse
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+static float uniform() {	// rand() rescaled so that the result lies in [-1, 1]
+  return ((float) rand() / RAND_MAX - 0.5) * 2.0;
+}
+
+static void	// stores n samples of uniform() * 32767 into buf
+random_floats (float *buf, unsigned n)
+{
+  for (float *cur = buf, *const stop = buf + n; cur != stop; ++cur)
+    *cur = uniform () * 32767;
+}
+
+
+void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
+  // Runs the "generic" and "sse" variants of the conjugate dot product on the same random
+  // vectors and requires their results to agree within a relative ERR_DELTA.
+  const int vlen = 789743;
+  volk_environment_init();
+
+  // Start from NULL so that releasing after a partial allocation failure is safe.
+  std::complex<float>* input = NULL;
+  std::complex<float>* taps = NULL;
+  std::complex<float>* result_generic = NULL;
+  std::complex<float>* result = NULL;
+
+  // posix_memalign reports errors only through a non-zero return value; OR the four
+  // codes together so that a single failure is enough to stop the test.
+  int ret = posix_memalign((void**)&input, 16, vlen << 3);
+  ret |= posix_memalign((void**)&taps, 16, vlen << 3);
+  ret |= posix_memalign((void**)&result_generic, 16, 8);
+  ret |= posix_memalign((void**)&result, 16, 8);
+  if (ret != 0) { free(input); free(taps); free(result_generic); free(result); }
+  CPPUNIT_ASSERT_MESSAGE("posix_memalign failed", ret == 0);
+
+  result_generic[0] = std::complex<float>(0,0);
+  result[0] = std::complex<float>(0,0);
+  random_floats((float*)input, vlen * 2);
+  random_floats((float*)taps, vlen * 2);
+
+  // NOTE(review): the length passed is vlen * 8, the byte size of each vector -- confirm the kernel counts bytes.
+  volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8,  "generic");
+  volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse");
+
+  printf("32fc_x2_conjugate_dot_prod_32fc_u\n");
+  printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0]));
+
+  // Copy the results out and free everything first: a failing assertion would skip any cleanup placed after it.
+  const std::complex<float> expected = result_generic[0];
+  const std::complex<float> actual = result[0];
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result);
+  assertcomplexEqual(expected, actual, ERR_DELTA);
+}
+
+
+#elif LV_HAVE_SSE && LV_HAVE_32
+
+static float uniform() {	// a single rand() value, shifted and scaled to cover [-1, 1]
+  return ((float) rand() / RAND_MAX - 0.5) * 2.0;
+}
+
+static void	// fills buf[0..n) with uniform() scaled by 32767
+random_floats (float *buf, unsigned n)
+{
+  unsigned idx = 0;
+  while (idx < n) buf[idx++] = uniform () * 32767;
+}
+
+
+void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
+  // Runs the "generic" and "sse_32" variants of the conjugate dot product on the same random
+  // vectors and requires their results to agree within a relative ERR_DELTA.
+  const int vlen = 789743;
+  volk_environment_init();
+
+  // Start from NULL so that releasing after a partial allocation failure is safe.
+  std::complex<float>* input = NULL;
+  std::complex<float>* taps = NULL;
+  std::complex<float>* result_generic = NULL;
+  std::complex<float>* result = NULL;
+
+  // posix_memalign reports errors only through a non-zero return value; OR the four
+  // codes together so that a single failure is enough to stop the test.
+  int ret = posix_memalign((void**)&input, 16, vlen << 3);
+  ret |= posix_memalign((void**)&taps, 16, vlen << 3);
+  ret |= posix_memalign((void**)&result_generic, 16, 8);
+  ret |= posix_memalign((void**)&result, 16, 8);
+  if (ret != 0) { free(input); free(taps); free(result_generic); free(result); }
+  CPPUNIT_ASSERT_MESSAGE("posix_memalign failed", ret == 0);
+
+  result_generic[0] = std::complex<float>(0,0);
+  result[0] = std::complex<float>(0,0);
+  random_floats((float*)input, vlen * 2);
+  random_floats((float*)taps, vlen * 2);
+
+  // NOTE(review): the length passed is vlen * 8, the byte size of each vector -- confirm the kernel counts bytes.
+  volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8,  "generic");
+  volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse_32");
+
+  printf("32fc_x2_conjugate_dot_prod_32fc_u\n");
+  printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0]));
+
+  // Copy the results out and free everything first: a failing assertion would skip any cleanup placed after it.
+  const std::complex<float> expected = result_generic[0];
+  const std::complex<float> actual = result[0];
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result);
+  assertcomplexEqual(expected, actual, ERR_DELTA);
+}
+
+
+#else
+
+void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {	// fallback body, compiled when neither LV_HAVE_SSE branch above is taken
+  printf("sse not available... no test performed\n");	// nothing is asserted on this path
+}
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h
new file mode 100644
index 000000000..f07402403
--- /dev/null
+++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H
+#define INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_x2_conjugate_dot_prod_32fc_u : public CppUnit::TestCase {
+  // CppUnit fixture for 32fc_x2_conjugate_dot_prod_32fc_u; the suite declared below contains the single test t1.
+  CPPUNIT_TEST_SUITE (qa_32fc_x2_conjugate_dot_prod_32fc_u);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();  // defined in qa_32fc_x2_conjugate_dot_prod_32fc_u.cc
+};
+
+
+#endif /* INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H */
diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc
new file mode 100644
index 000000000..618a82a02
--- /dev/null
+++ b/volk/lib/qa_32u_popcnt_aligned16.cc
@@ -0,0 +1,62 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32u_popcnt_aligned16.h>
+#include <volk/volk_32u_popcnt_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse4.2
+
+#ifndef LV_HAVE_SSE4_2
+
+void qa_32u_popcnt_aligned16::t1() {	// fallback body, compiled when LV_HAVE_SSE4_2 is not defined
+  printf("sse4.2 not available... no test performed\n");	// nothing is asserted on this path
+}
+
+#else
+
+void qa_32u_popcnt_aligned16::t1() {
+  // Sums the popcount of one random 32-bit word over ITERS calls, once through the
+  // "generic" variant and once through the runtime dispatcher, then compares the sums.
+  volk_runtime_init();
+
+  volk_environment_init();
+
+  const int ITERS = 10000000;
+  uint32_t input0 __attribute__ ((aligned (16)));
+  uint32_t output0 __attribute__ ((aligned (16)));
+  uint32_t output01 __attribute__ ((aligned (16)));
+
+  input0 = ((uint32_t) (rand() - (RAND_MAX/2)));
+  output0 = 0;
+  output01 = 0;
+
+  printf("32u_popcnt_aligned\n");
+
+  // Pass 1: explicitly selected generic implementation.
+  uint32_t ret = 0;
+  clock_t start = clock();
+  for(int rep = ITERS; rep > 0; --rep) {
+    volk_32u_popcnt_aligned16_manual(&ret, input0, "generic");
+    output0 += ret;
+  }
+  clock_t end = clock();
+  double total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  // Pass 2: whatever implementation get_volk_runtime() provides.
+  ret = 0;
+  start = clock();
+  for(int rep = ITERS; rep > 0; --rep) {
+    get_volk_runtime()->volk_32u_popcnt_aligned16(&ret, input0);
+    output01 += ret;
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.2_time: %f\n", total);
+
+  // Identical input and iteration count, so the two sums have to match exactly.
+  CPPUNIT_ASSERT_EQUAL(output0, output01);
+}
+
+#endif
diff --git a/volk/lib/qa_32u_popcnt_aligned16.h b/volk/lib/qa_32u_popcnt_aligned16.h
new file mode 100644
index 000000000..fa1dc1041
--- /dev/null
+++ b/volk/lib/qa_32u_popcnt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32U_POPCNT_ALIGNED16_H
+#define INCLUDED_QA_32U_POPCNT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32u_popcnt_aligned16 : public CppUnit::TestCase {
+  // CppUnit fixture for 32u_popcnt_aligned16; the suite declared below contains the single test t1.
+  CPPUNIT_TEST_SUITE (qa_32u_popcnt_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();  // defined in qa_32u_popcnt_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_32U_POPCNT_ALIGNED16_H */
diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc
new file mode 100644
index 000000000..85ef58795
--- /dev/null
+++ b/volk/lib/qa_64u_popcnt_aligned16.cc
@@ -0,0 +1,62 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_64u_popcnt_aligned16.h>
+#include <volk/volk_64u_popcnt_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse4.2
+
+#ifndef LV_HAVE_SSE4_2
+
+void qa_64u_popcnt_aligned16::t1() {	// fallback body, compiled when LV_HAVE_SSE4_2 is not defined
+  printf("sse4.2 not available... no test performed\n");	// nothing is asserted on this path
+}
+
+#else
+
+void qa_64u_popcnt_aligned16::t1() {
+  // Sums the popcount of one random 64-bit word over ITERS calls, once through the
+  // "generic" variant and once through the runtime dispatcher, then compares the sums.
+  volk_runtime_init();
+  volk_environment_init();
+
+  const int ITERS = 10000000;
+  uint64_t input0 __attribute__ ((aligned (16)));
+  uint64_t output0 __attribute__ ((aligned (16)));
+  uint64_t output01 __attribute__ ((aligned (16)));
+
+  // Combine three rand() draws at different offsets: casting a single (rand() - RAND_MAX/2) to
+  // uint64_t merely sign-extends an int, so the upper 32 bits were always all zeros or all ones.
+  input0 = ((uint64_t) rand() << 42) ^ ((uint64_t) rand() << 21) ^ (uint64_t) rand();
+  output0 = 0;
+  output01 = 0;
+
+  printf("64u_popcnt_aligned\n");
+
+  // Pass 1: explicitly selected generic implementation.
+  uint64_t ret = 0;
+  clock_t start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_64u_popcnt_aligned16_manual(&ret, input0, "generic");
+    output0 += ret;
+  }
+  clock_t end = clock();
+  double total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  // Pass 2: whatever implementation get_volk_runtime() provides.
+  ret = 0;
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_64u_popcnt_aligned16(&ret, input0);
+    output01 += ret;
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.2_time: %f\n", total);
+
+  CPPUNIT_ASSERT_EQUAL(output0, output01);
+}
+
+#endif
diff --git a/volk/lib/qa_64u_popcnt_aligned16.h b/volk/lib/qa_64u_popcnt_aligned16.h
new file mode 100644
index 000000000..217822d6e
--- /dev/null
+++ b/volk/lib/qa_64u_popcnt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64U_POPCNT_ALIGNED16_H
+#define INCLUDED_QA_64U_POPCNT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64u_popcnt_aligned16 : public CppUnit::TestCase {
+  // CppUnit fixture for 64u_popcnt_aligned16; the suite declared below contains the single test t1.
+  CPPUNIT_TEST_SUITE (qa_64u_popcnt_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();  // defined in qa_64u_popcnt_aligned16.cc
+};
+
+
+#endif /* INCLUDED_QA_64U_POPCNT_ALIGNED16_H */
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
new file mode 100644
index 000000000..b0f63d2b5
--- /dev/null
+++ b/volk/lib/qa_utils.cc
@@ -0,0 +1,440 @@
+#include "qa_utils.h"
+#include <cstring>
+#include <boost/foreach.hpp>
+#include <boost/assign/list_of.hpp>
+#include <boost/tokenizer.hpp>
+//#include <boost/test/unit_test.hpp>
+#include <iostream>
+#include <vector>
+#include <list>
+#include <ctime>
+#include <cmath>
+#include <boost/lexical_cast.hpp>
+//#include <volk/volk_runtime.h>
+#include <volk/volk_registry.h>
+#include <volk/volk.h>
+#include <boost/typeof/typeof.hpp>
+#include <boost/type_traits.hpp>
+
+float uniform() {	// one rand() draw mapped linearly onto [-1, 1]
+  return ((float) rand() / RAND_MAX - 0.5) * 2.0;
+}
+
+template <class t>	//fills buf[0..n) with uniform() samples converted to t
+void random_floats (t *buf, unsigned n)
+{
+  t *const stop = buf + n;
+  for (t *cur = buf; cur != stop; ++cur) *cur = uniform ();
+}
+
+//Fill data with n random elements of type (2n values if complex); throws const char* on an unsupported integer size.
+void load_random_data(void *data, volk_type_t type, unsigned int n) {
+    if(type.is_complex) n *= 2;
+    if(type.is_float) {
+        if(type.size == 8) random_floats<double>((double *)data, n);
+        else random_floats<float>((float *)data, n);
+    } else {
+        const double span = std::ldexp(1.0, type.size*8); //2^bits, computed without the undefined 64-bit shift
+        const double lowest = type.is_signed ? -span/2.0 : 0.0; //signed range starts at -2^(bits-1), unsigned at 0
+        for(unsigned int i=0; i<n; i++) {
+            const double scaled_rand = lowest + (rand() / (RAND_MAX + 1.0)) * span; //in [lowest, lowest + span), so the casts below never overflow
+            switch(type.size) {
+            case 8:
+                if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand;
+                else ((uint64_t *)data)[i] = (uint64_t) scaled_rand;
+            break;
+            case 4:
+                if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand;
+                else ((uint32_t *)data)[i] = (uint32_t) scaled_rand;
+            break;
+            case 2:
+                if(type.is_signed) ((int16_t *)data)[i] = (int16_t) scaled_rand;
+                else ((uint16_t *)data)[i] = (uint16_t) scaled_rand;
+            break;
+            case 1:
+                if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand;
+                else ((uint8_t *)data)[i] = (uint8_t) scaled_rand;
+            break;
+            default:
+                throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here
+            }
+        }
+    }
+}
+
+//Converts an arch table into implementation names. archs[0] holds the number of entries that
+//follow; each entry is compared against the 1<<LV_* values listed below and, on a match, the
+//corresponding name is appended. Entries matching none of them are skipped.
+//The returned names keep the order of the table.
+static std::vector<std::string> get_arch_list(const int archs[]) {
+    //there has got to be a way to query these arches
+    static const struct {
+        int mask;           //1<<LV_* value as it appears in the table
+        const char *label;  //name pushed for that value
+    } known[] = {
+        { (1<<LV_GENERIC), "generic" },
+        { (1<<LV_ORC),     "orc" },
+        { (1<<LV_SSE),     "sse" },
+        { (1<<LV_SSE2),    "sse2" },
+        { (1<<LV_SSE3),    "sse3" },
+        { (1<<LV_SSSE3),   "ssse3" },
+        { (1<<LV_SSE4_1),  "sse4_1" },
+        { (1<<LV_SSE4_2),  "sse4_2" },
+        { (1<<LV_SSE4_A),  "sse4_a" },
+        { (1<<LV_MMX),     "mmx" },
+        { (1<<LV_AVX),     "avx" },
+    };
+    //number of rows in the table above
+    const int num_known = sizeof(known) / sizeof(known[0]);
+
+    std::vector<std::string> archlist;
+    const int num_archs = archs[0];
+
+    //entry i of the table lives at archs[i+1], right after the count
+    for(int i = 0; i < num_archs; i++) {
+        const int wanted = archs[i+1];
+
+        //linear search, first match wins
+        int k = 0;
+        while(k < num_known && known[k].mask != wanted) {
+            k++;
+        }
+
+        if(k == num_known) {
+            continue; //unknown value: contributes nothing
+        }
+        archlist.push_back(known[k].label);
+    }
+
+    return archlist;
+}
+
+//Parses a type token such as "32fc" or "s32f": optional leading 's' (scalar), a bit width that
+//must be a positive multiple of 8 up to 64, then any of f/i/c/u. Throws on a malformed token.
+volk_type_t volk_type_from_string(std::string name) {
+    volk_type_t type;
+    type.is_float = false;
+    type.is_scalar = false;
+    type.is_complex = false;
+    type.is_signed = false;
+    type.size = 0;
+    type.str = name;
+    
+    if(name.size() < 2) throw std::string("name too short to be a datatype");
+    //is it a scalar?
+    if(name[0] == 's') { 
+        type.is_scalar = true;
+        name = name.substr(1, name.size()-1);
+    }
+    
+    //get the data size
+    const std::string::size_type last_size_pos = name.find_last_of("0123456789");
+    if(last_size_pos == std::string::npos) throw std::string("no size spec in type ").append(name);
+    //will throw if malformed
+    int size = boost::lexical_cast<int>(name.substr(0, last_size_pos+1));
+    //thrown rather than asserted: an assert aborts the whole run, or vanishes under NDEBUG
+    if(((size % 8) != 0) || (size > 64) || (size <= 0)) throw std::string("bad size in type ").append(name);
+    type.size = size/8; //in bytes
+    
+    for(std::string::size_type i=last_size_pos+1; i < name.size(); i++) {
+        switch (name[i]) {
+        case 'f':
+            type.is_float = true;
+            break;
+        case 'i':
+            type.is_signed = true;
+            break;
+        case 'c':
+            type.is_complex = true;
+            break;
+        case 'u':
+            type.is_signed = false;
+            break;
+        default: //a bare "throw;" here has no exception to rethrow and would end in std::terminate
+            throw std::string("unknown type suffix in ").append(name);
+        }
+    }
+    return type;
+}
+
+static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, //out: types parsed before the function name
+                                   std::vector<volk_type_t> &outputsig, //out: types parsed after the function name
+                                   std::string name) { //kernel name; its first '_'-separated token must be "volk"
+    boost::char_separator<char> sep("_");
+    boost::tokenizer<boost::char_separator<char> > tok(name, sep);
+    std::vector<std::string> toked;
+    tok.assign(name);
+    toked.assign(tok.begin(), tok.end());
+    //the leading "volk" token is checked and then dropped
+    assert(toked[0] == "volk");
+    toked.erase(toked.begin());
+
+    //ok. we're assuming a string in the form
+    //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
+    //side records which part of the name the current token falls in
+    enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT;
+    std::string fn_name; //collected below but not read again in this function
+    volk_type_t type;
+    BOOST_FOREACH(std::string token, toked) {
+        try {
+            type = volk_type_from_string(token); //throws for anything that is not a type token
+            if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
+            //types seen before the name are inputs, everything later is an output
+            if(side == SIDE_INPUT) inputsig.push_back(type);
+            else outputsig.push_back(type);
+        } catch (...){
+            if(token[0] == 'x') { //it's a multiplier
+                if(side == SIDE_INPUT) assert(inputsig.size() > 0);
+                else assert(outputsig.size() > 0);
+                int multiplier = boost::lexical_cast<int>(token.substr(1, token.size()-1)); //will throw if invalid
+                for(int i=1; i<multiplier; i++) { //the type is already present once, so add multiplier-1 copies
+                    if(side == SIDE_INPUT) inputsig.push_back(inputsig.back());
+                    else outputsig.push_back(outputsig.back());
+                }
+            }
+            else if(side == SIDE_INPUT) { //it's the function name, at least it better be
+                side = SIDE_NAME;
+                fn_name.append("_");
+                fn_name.append(token);
+            } 
+            else if(side == SIDE_OUTPUT) {
+                if(token != toked.back()) throw; //the last token in the name is the alignment
+            }
+        }
+    }
+    //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input!
+    assert(inputsig.size() != 0);
+}
+
+inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+    for(const char *impl = arch.c_str(); iter > 0; --iter) func(buffs[0], vlen, impl); //iter calls on the same buffers for implementation arch
+}
+
+inline void run_cast_test2(volk_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+    for(const char *impl = arch.c_str(); iter > 0; --iter) func(buffs[0], buffs[1], vlen, impl);
+}
+
+inline void run_cast_test3(volk_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+    for(const char *impl = arch.c_str(); iter > 0; --iter) func(buffs[0], buffs[1], buffs[2], vlen, impl);
+}
+
+inline void run_cast_test4(volk_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+    for(const char *impl = arch.c_str(); iter > 0; --iter) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, impl);
+}
+
+inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+    for(const char *impl = arch.c_str(); iter > 0; --iter) func(buffs[0], scalar, vlen, impl); //same, with the float scalar passed ahead of the length
+}
+
+inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+    for(const char *impl = arch.c_str(); iter > 0; --iter) func(buffs[0], buffs[1], scalar, vlen, impl);
+}
+
+inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+    for(const char *impl = arch.c_str(); iter > 0; --iter) func(buffs[0], buffs[1], buffs[2], scalar, vlen, impl);
+}
+
+template <class t> //returns true (= failure) when some in2[i] differs from in1[i] by more than tol, relative to |in1[i]|
+bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
+    bool fail = false;
+    int print_max_errs = 10; //only the first ten mismatches are printed
+    for(unsigned int i=0; i<vlen; i++) {
+        const t ref_mag = fabs(in1[i]); //magnitude: testing the signed value would skip every negative reference
+        if(ref_mag < 1e-30) continue; //this is a hack: below around here we'll start to get roundoff errors due to limited precision
+        if(fabs(in1[i] - in2[i])/ref_mag > tol) {
+            fail=true;
+            if(print_max_errs-- > 0) {
+                std::cout << "offset " << i << " in1: " << in1[i] << " in2: " << in2[i] << std::endl;
+            }
+        }
+    }
+    return fail;
+}
+
+template <class t> //returns true (= failure) when some |in1[i] - in2[i]| exceeds tol
+bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) {
+    bool fail = false;
+    int print_max_errs = 10; //only the first ten mismatches are printed
+    for(unsigned int i=0; i<vlen; i++) {
+        const unsigned long long diff = (in1[i] > in2[i]) ? (unsigned long long)in1[i] - (unsigned long long)in2[i] : (unsigned long long)in2[i] - (unsigned long long)in1[i]; //larger minus smaller in unsigned arithmetic, so it cannot wrap below zero or overflow
+        if(diff > tol) {
+            fail=true;
+            if(print_max_errs-- > 0) { //unary + promotes 8-bit types so they print as numbers, not characters
+                std::cout << "offset " << i << " in1: " << +in1[i] << " in2: " << +in2[i] << std::endl;
+            }
+        }
+    }
+    return fail;
+}
+
+class volk_qa_aligned_mem_pool{ //hands out zero-filled, aligned buffers that stay alive as long as the pool does
+public:
+    void *get_new(size_t size, size_t alignment = 16){
+        const size_t slack = alignment-1; //extra bytes that let the start be rounded up; the mask below assumes a power-of-two alignment
+        _mems.push_back(std::vector<char>(size + slack, 0));
+        return (void *)((size_t(&_mems.back().front()) + slack) & ~slack);
+    }
+private: std::list<std::vector<char> > _mems; //one vector per buffer handed out
+};
+
+//Runs the kernel called `name` once per architecture listed in archs (iter calls each, on vectors
+//of vlen elements), prints the time taken, and compares every buffer of every architecture with
+//the "generic" one. The argument list is derived from `name`. Returns true if any comparison
+//failed, false otherwise (also false when fewer than two architectures are recognised).
+bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, float scalar, int vlen, int iter) {
+    std::cout << "RUN_VOLK_TESTS: " << name << std::endl;
+    
+    //first let's get a list of available architectures for the test
+    std::vector<std::string> arch_list = get_arch_list(archs);
+    
+    if(arch_list.size() < 2) {
+        std::cout << "no architectures to test" << std::endl;
+        return false;
+    }
+
+    //something that can hang onto memory and cleanup when this function exits
+    volk_qa_aligned_mem_pool mem_pool;
+
+    //now we have to get a function signature by parsing the name
+    std::vector<volk_type_t> inputsig, outputsig;
+    get_signatures_from_name(inputsig, outputsig, name);
+    
+    //pull the input scalars into their own vector
+    std::vector<volk_type_t> inputsc;
+    for(int i=0; i<inputsig.size(); ) {
+        if(inputsig[i].is_scalar) {
+            inputsc.push_back(inputsig[i]);
+            inputsig.erase(inputsig.begin() + i); //the next entry slides into slot i, so i must not advance here
+        } else i++;
+    }
+
+    std::vector<void *> inbuffs;
+    BOOST_FOREACH(volk_type_t sig, inputsig) {
+        if(!sig.is_scalar) //we don't make buffers for scalars
+          inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1)));
+    }
+    for(int i=0; i<inbuffs.size(); i++) {
+        load_random_data(inbuffs[i], inputsig[i], vlen);
+    }
+    
+    //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch
+    std::vector<std::vector<void *> > test_data;
+    for(int i=0; i<arch_list.size(); i++) {
+        std::vector<void *> arch_buffs;
+        for(int j=0; j<outputsig.size(); j++) {
+            arch_buffs.push_back(mem_pool.get_new(vlen*outputsig[j].size*(outputsig[j].is_complex ? 2 : 1)));
+        }
+        for(int j=0; j<inputsig.size(); j++) {
+            arch_buffs.push_back(inbuffs[j]);
+        }
+        test_data.push_back(arch_buffs);
+    }
+    //argument order used for every call below: the outputs first, then the inputs
+    std::vector<volk_type_t> both_sigs;
+    both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end());
+    both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end());
+
+    //now run the test
+    clock_t start, end;
+    for(int i = 0; i < arch_list.size(); i++) {
+        start = clock();
+        //manual_func is cast to the pointer type that matches the buffer count and the scalar, if any
+        switch(both_sigs.size()) {
+            case 1:
+                if(inputsc.size() == 0) {
+                    run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); 
+                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+                    run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+                } else throw "unsupported 1 arg function >1 scalars";
+                break;
+            case 2:
+                if(inputsc.size() == 0) {
+                    run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+                    run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+                } else throw "unsupported 2 arg function >1 scalars";
+                break;
+            case 3:
+                if(inputsc.size() == 0) {
+                    run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+                    run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+                } else throw "unsupported 3 arg function >1 scalars";
+                break;
+            case 4:
+                run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+                break;
+            default:
+                throw "no function handler for this signature";
+                break;
+        }
+        
+        end = clock();
+        std::cout << arch_list[i] << " completed in " << (double)(end-start)/(double)CLOCKS_PER_SEC << "s" << std::endl;
+    }
+    //and now compare each output to the generic output
+    //first we have to know which output is the generic one, they aren't in order...
+    int generic_offset=0;
+    for(int i=0; i<arch_list.size(); i++) 
+        if(arch_list[i] == "generic") generic_offset=i;
+
+    //now compare
+    //input slots hold the same shared buffer for every arch, so only the output slots can actually differ
+    bool fail = false;
+    bool fail_global = false;
+    for(int i=0; i<arch_list.size(); i++) {
+        if(i != generic_offset) {
+            for(int j=0; j<both_sigs.size(); j++) {
+                if(both_sigs[j].is_float) {
+                    if(both_sigs[j].size == 8) {
+                        fail = fcompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                    } else {
+                        fail = fcompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                    }
+                } else {
+                    //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ
+                    switch(both_sigs[j].size) {
+                    case 8:
+                        if(both_sigs[j].is_signed) {
+                            fail = icompare((int64_t *) test_data[generic_offset][j], (int64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        } else {
+                            fail = icompare((uint64_t *) test_data[generic_offset][j], (uint64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        }
+                        break;
+                    case 4:
+                        if(both_sigs[j].is_signed) {
+                            fail = icompare((int32_t *) test_data[generic_offset][j], (int32_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        } else {
+                            fail = icompare((uint32_t *) test_data[generic_offset][j], (uint32_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        }
+                        break;
+                    case 2:
+                        if(both_sigs[j].is_signed) {
+                            fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        } else {
+                            fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        }
+                        break;
+                    case 1:
+                        if(both_sigs[j].is_signed) {
+                            fail = icompare((int8_t *) test_data[generic_offset][j], (int8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        } else {
+                            fail = icompare((uint8_t *) test_data[generic_offset][j], (uint8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        }
+                        break;
+                    default:
+                        fail=1;
+                    }
+                }
+                if(fail) {
+                    fail_global = true;
+                    std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
+                }
+            }
+        }
+    }
+
+    return fail_global;
+}
+
+
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
new file mode 100644
index 000000000..1b64bacaa
--- /dev/null
+++ b/volk/lib/qa_utils.h
@@ -0,0 +1,33 @@
+#ifndef VOLK_QA_UTILS_H
+#define VOLK_QA_UTILS_H
+
+#include <cstdlib>
+#include <string>
+
+struct volk_type_t { // description of one kernel operand type; returned by volk_type_from_string()
+    bool is_float;   // NOTE(review): presumably floating-point rather than integer data -- confirm in qa_utils.cc
+    bool is_scalar;  // NOTE(review): presumably marks a scalar argument rather than a vector -- confirm
+    bool is_signed;  // run_volk_tests() reads this to choose the signed or unsigned integer comparison
+    bool is_complex; // run_volk_tests() compares 2*vlen items instead of vlen when this is set
+    int size;        // NOTE(review): looks like the element size in bytes (1, 2, ...) -- confirm
+    std::string str; // NOTE(review): presumably the textual form of the type -- confirm
+};
+
+volk_type_t volk_type_from_string(std::string); // NOTE(review): presumably parses a type name into its description -- definition is not in this header
+
+float uniform(void); // NOTE(review): presumably returns one uniformly distributed random sample; range not visible here -- confirm
+void random_floats(float *buf, unsigned n); // NOTE(review): presumably fills buf[0..n-1] with random values -- confirm
+
+bool run_volk_tests(const int[], void(*)(), std::string, float, float, int, int); // arguments as passed by VOLK_RUN_TESTS: arch_defs, manual entry point, kernel name, tol, scalar, len, iter; returns true on failure
+
+#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_CHECK_EQUAL(run_volk_tests(func##_arch_defs, (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter), 0) // runs kernel `func` via its func##_arch_defs / func##_manual symbols and reports a Boost check failure unless run_volk_tests() returns false
+
+typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place; NOTE(review): trailing args are presumably the item count and an arch name -- confirm
+typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*); //two buffer pointers, then the same two trailing args
+typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*); //three buffer pointers
+typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*); //four buffer pointers
+typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
+typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*); //two buffer pointers plus one scalar float
+typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*); //three buffer pointers plus one scalar float
+
+#endif //VOLK_QA_UTILS_H
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
new file mode 100644
index 000000000..779bc61eb
--- /dev/null
+++ b/volk/lib/testqa.cc
@@ -0,0 +1,100 @@
+#include "qa_utils.h"
+#include <volk/volk.h>
+#include <volk/volk_registry.h>
+#include <boost/test/unit_test.hpp>
+
+// Runs every enabled kernel through run_volk_tests(), which compares each
+// arch's output with the reference output. Macro arguments: kernel,
+// tolerance, scalar, vector length, iterations. Each kernel is listed once.
+BOOST_AUTO_TEST_CASE(volk_test_all) {
+    //in order... (commented-out entries with four arguments are missing one: VOLK_RUN_TESTS takes five)
+//    VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a16, 1e-4, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 32768.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a16, 1e-5, 32768.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a16, 1, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a16, 1e-5, 32768.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a16, 1e-4, 32768.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_convert_8i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_max_star_16i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a16, 0, 0, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a16, 1e-4, 0, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16u_byteswap_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_add_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a16, 1e-4, 0, 2046, 1000);
+    VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a16, 1e-4, 20.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a16, 1e-4, 10.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a16, 0, 32768, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_index_max_16u_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 1, 32768, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a16, 1, 32768, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a16, 1, 2147483648.0, 2046, 10000); // 2^31 as a literal: "2<<31" overflows int
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2147483648.0, 2046, 10000); // 2^31 as a literal: "2<<31" overflows int
+    VOLK_RUN_TESTS(volk_32f_convert_64f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 1, 128, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16, 1e-4, 10, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 1, 32768, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_max_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_min_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_normalize_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a16, 1e-4, 4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_sqrt_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32i_x2_and_32i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32i_x2_or_32i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32u_byteswap_a16, 0, 0, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_32u_popcnt_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_64f_convert_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_64f_x2_max_64f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_64f_x2_min_64f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_64u_byteswap_a16, 0, 0, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_64u_popcnt_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a16, 0, 256, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8i_convert_16i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 2046, 10000);
+
+}
diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c
new file mode 100644
index 000000000..b1a93db26
--- /dev/null
+++ b/volk/lib/volk_rank_archs.c
@@ -0,0 +1,13 @@
+#include<volk_rank_archs.h>
+#include<stdio.h>
+
+/* Returns the 0-based index of the preferred entry of arch_defs for the feature mask `arch`. */
+unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch) {
+  unsigned int best_val = 0; /* the first entry is the fallback and is never checked against arch */
+  int i; /* arch_defs[0] is the entry count; the entries themselves are arch_defs[1..count] */
+  for(i = 2; i < arch_defs[0] + 1; ++i)
+    /* eligible only if no bit set in the entry is clear in arch (bitwise ~; logical ! accepted everything for arch != 0) */
+    if((arch_defs[i] & ~arch) == 0 && arch_defs[i] > arch_defs[best_val + 1])
+      best_val = (unsigned int)(i - 1); /* among eligible entries the numerically largest wins */
+  return best_val;
+}
diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h
new file mode 100644
index 000000000..26b9f7503
--- /dev/null
+++ b/volk/lib/volk_rank_archs.h
@@ -0,0 +1,14 @@
+#ifndef INCLUDED_VOLK_RANK_ARCHS_H
+#define INCLUDED_VOLK_RANK_ARCHS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Returns a 0-based index into the entries of arch_defs (arch_defs[0] holds the entry count), selected using `arch`. */
+unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/