summaryrefslogtreecommitdiff
path: root/volk/lib
diff options
context:
space:
mode:
Diffstat (limited to 'volk/lib')
-rw-r--r--volk/lib/CMakeLists.txt352
-rw-r--r--volk/lib/gcc_x86_cpuid.h188
-rw-r--r--volk/lib/qa_16s_add_quad_aligned16.cc89
-rw-r--r--volk/lib/qa_16s_add_quad_aligned16.h18
-rw-r--r--volk/lib/qa_16s_branch_4_state_8_aligned16.cc106
-rw-r--r--volk/lib/qa_16s_branch_4_state_8_aligned16.h18
-rw-r--r--volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc78
-rw-r--r--volk/lib/qa_16s_permute_and_scalar_add_aligned16.h18
-rw-r--r--volk/lib/qa_16s_quad_max_star_aligned16.cc60
-rw-r--r--volk/lib/qa_16s_quad_max_star_aligned16.h18
-rw-r--r--volk/lib/qa_32f_fm_detect_aligned16.cc61
-rw-r--r--volk/lib/qa_32f_fm_detect_aligned16.h18
-rw-r--r--volk/lib/qa_32f_index_max_aligned16.cc103
-rw-r--r--volk/lib/qa_32f_index_max_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_index_max_aligned16.cc89
-rw-r--r--volk/lib/qa_32fc_index_max_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc64
-rw-r--r--volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc138
-rw-r--r--volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h18
-rw-r--r--volk/lib/qa_32u_popcnt_aligned16.cc62
-rw-r--r--volk/lib/qa_32u_popcnt_aligned16.h18
-rw-r--r--volk/lib/qa_64u_popcnt_aligned16.cc62
-rw-r--r--volk/lib/qa_64u_popcnt_aligned16.h18
-rw-r--r--volk/lib/qa_utils.cc477
-rw-r--r--volk/lib/qa_utils.h41
-rw-r--r--volk/lib/testqa.cc90
-rw-r--r--volk/lib/volk_prefs.c50
-rw-r--r--volk/lib/volk_rank_archs.c112
-rw-r--r--volk/lib/volk_rank_archs.h50
30 files changed, 2470 insertions, 0 deletions
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt
new file mode 100644
index 000000000..68fadc35b
--- /dev/null
+++ b/volk/lib/CMakeLists.txt
@@ -0,0 +1,352 @@
+#
+# Copyright 2011-2012 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+########################################################################
+# header file detection
+#
+# Probe for each optional system header. When a header is found, the
+# matching HAVE_<HEADER>_H definition is added for this directory.
+########################################################################
+include(CheckIncludeFile)
+
+foreach(volk_header cpuid.h intrin.h fenv.h dlfcn.h)
+    #derive the result variable name, e.g. cpuid.h -> HAVE_CPUID_H
+    string(TOUPPER "HAVE_${volk_header}" volk_have_header)
+    string(REPLACE "." "_" volk_have_header "${volk_have_header}")
+
+    check_include_file(${volk_header} ${volk_have_header})
+    if(${volk_have_header})
+        add_definitions(-D${volk_have_header})
+    endif()
+endforeach()
+
+#when dlfcn.h exists, also link against the libraries named by CMAKE_DL_LIBS
+if(HAVE_DLFCN_H)
+    list(APPEND volk_libraries ${CMAKE_DL_LIBS})
+endif()
+
+########################################################################
+# Setup the compiler name
+########################################################################
+if(MSVC)
+    #under MSVC the name is forced instead of taken from the compiler id
+    set(COMPILER_NAME MSVC)
+else()
+    set(COMPILER_NAME ${CMAKE_C_COMPILER_ID})
+endif()
+
+message(STATUS "Compiler name: ${COMPILER_NAME}")
+
+#an empty compiler id leaves COMPILER_NAME unset; refuse to go on in that case
+if(NOT DEFINED COMPILER_NAME)
+    message(FATAL_ERROR "COMPILER_NAME undefined. Volk build may not support this compiler.")
+endif()
+
+########################################################################
+# Special clang flag so flag checks can fail
+#
+# When the compiler accepts -Werror=unused-command-line-argument, that
+# flag is passed along with every flag check further down, so that a
+# flag the compiler only warns about makes the check fail.
+# The compiler id of clang is "Clang", so it has to be matched as well;
+# "GNU" is kept in the pattern for backward compatibility.
+########################################################################
+if(COMPILER_NAME MATCHES "GNU|Clang")
+    include(CheckCXXCompilerFlag)
+    CHECK_CXX_COMPILER_FLAG("-Werror=unused-command-line-argument" HAVE_WERROR_UNUSED_CMD_LINE_ARG)
+    if(HAVE_WERROR_UNUSED_CMD_LINE_ARG)
+        set(VOLK_FLAG_CHECK_FLAGS "-Werror=unused-command-line-argument")
+    endif()
+endif()
+
+########################################################################
+# detect x86 flavor of CPU
+########################################################################
+#The expansion is quoted so that an empty CMAKE_SYSTEM_PROCESSOR is compared
+#as an empty string instead of vanishing from the condition.
+#AMD64 (upper case) is listed because the pattern match is case sensitive.
+if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(i.86|x86|x86_64|amd64|AMD64)$")
+    message(STATUS "x86* CPU detected")
+    set(CPU_IS_x86 TRUE)
+endif()
+
+########################################################################
+# determine passing architectures based on compile flag tests
+########################################################################
+#Run the volk_compile_utils.py script in "arch_flags" mode for the selected
+#compiler and capture what it prints in arch_flag_lines.
+#The exit status of the script is not checked here.
+execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "arch_flags" --compiler "${COMPILER_NAME}"
+ OUTPUT_VARIABLE arch_flag_lines OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+#check_arch(arch_name [flag...])
+#Tests every compiler flag listed after arch_name. Only when all of them
+#are accepted is arch_name appended to the available_archs list.
+#have_<arch_name> is set to TRUE or FALSE accordingly.
+#This stays a macro on purpose: it writes variables in the caller's scope.
+include(CheckCXXCompilerFlag)
+macro(check_arch arch_name)
+    set(flags ${ARGN})
+    set(have_${arch_name} TRUE)
+    foreach(flag ${flags})
+        #build the result variable name: "have" + flag, with every character
+        #outside [A-Za-z0-9_] replaced by "_" (done natively, no python needed)
+        string(REGEX REPLACE "[^A-Za-z0-9_]" "_" have_flag "have${flag}")
+        if(VOLK_FLAG_CHECK_FLAGS)
+            set(CMAKE_REQUIRED_FLAGS ${VOLK_FLAG_CHECK_FLAGS})
+        endif()
+        CHECK_CXX_COMPILER_FLAG(${flag} ${have_flag})
+        unset(CMAKE_REQUIRED_FLAGS)
+        #one rejected flag is enough to rule the architecture out
+        if(NOT ${have_flag})
+            set(have_${arch_name} FALSE)
+        endif()
+    endforeach()
+    if(have_${arch_name})
+        list(APPEND available_archs ${arch_name})
+    endif()
+endmacro()
+
+#each entry is comma separated; split it into a list and hand it to
+#check_arch, which takes the first item as arch name and the rest as flags
+foreach(arch_flag_line ${arch_flag_lines})
+    string(REPLACE "," ";" arch_and_flags "${arch_flag_line}")
+    check_arch(${arch_and_flags})
+endforeach()
+
+#OVERRULE_ARCH(arch reason)
+#Reports the reason as a status message and removes arch from the
+#available_archs list. Written as a macro, so the list is edited in the
+#scope of the caller.
+macro(OVERRULE_ARCH arch reason)
+ message(STATUS "${reason}, Overruled arch ${arch}")
+ list(REMOVE_ITEM available_archs ${arch})
+endmacro(OVERRULE_ARCH)
+
+########################################################################
+# eliminate AVX on GCC < 4.4
+# even though it accepts -mavx, `as` won't assemble xgetbv, which we need
+########################################################################
+if(CPU_IS_x86 AND COMPILER_NAME MATCHES "GNU")
+ #ask the C compiler for its version via -dumpversion
+ execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
+ OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(GCC_VERSION VERSION_LESS "4.4")
+ OVERRULE_ARCH(avx "GCC missing xgetbv")
+ endif()
+endif()
+
+########################################################################
+# implement overruling in the ORC case,
+# since ORC always passes flag detection
+########################################################################
+#ORC_FOUND is not set in this file; it is expected from the including scope
+if(NOT ORC_FOUND)
+ OVERRULE_ARCH(orc "ORC support not found")
+endif()
+
+########################################################################
+# implement overruling in the non-multilib case
+# this makes things work when both -m32 and -m64 pass
+########################################################################
+if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
+    include(CheckTypeSize)
+    #the size in bytes of an array of 8 pointers is compared against the
+    #pointer width in bits (64 or 32) below
+    check_type_size("void*[8]" SIZEOF_CPU BUILTIN_TYPES_ONLY)
+
+    #SIZEOF_CPU may be empty when the size check fails; the quotes keep
+    #the comparisons well formed in that case
+    if("${SIZEOF_CPU}" EQUAL 64)
+        OVERRULE_ARCH(32 "CPU width is 64 bits")
+    endif()
+    if("${SIZEOF_CPU}" EQUAL 32)
+        OVERRULE_ARCH(64 "CPU width is 32 bits")
+    endif()
+
+    #MSVC 64 bit does not have MMX, overrule it
+    if("${SIZEOF_CPU}" EQUAL 64 AND MSVC)
+        OVERRULE_ARCH(mmx "No MMX for Win64")
+    endif()
+
+endif()
+
+########################################################################
+# done overrules! print the result
+########################################################################
+message(STATUS "Available architectures: ${available_archs}")
+
+########################################################################
+# determine available machines given the available architectures
+########################################################################
+#Run volk_compile_utils.py in "machines" mode, passing the architectures
+#that survived the overrules, and capture its output in available_machines.
+#The exit status of the script is not checked here.
+execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "machines" --archs "${available_archs}"
+ OUTPUT_VARIABLE available_machines OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+########################################################################
+# Implement machine overruling for redundant machines:
+# A machine is redundant when expansion rules occur,
+# and the arch superset passes configuration checks.
+# When this occurs, eliminate the redundant machines
+# to avoid unnecessary compilation of subset machines.
+########################################################################
+foreach(arch mmx orc 64 32)
+    foreach(machine_name ${available_machines})
+        string(REPLACE "_${arch}" "" machine_name_no_arch "${machine_name}")
+        #the name changed, so this machine contains "_${arch}";
+        #drop the machine that has the same name without that part
+        if(NOT "${machine_name}" STREQUAL "${machine_name_no_arch}")
+            list(REMOVE_ITEM available_machines ${machine_name_no_arch})
+        endif()
+    endforeach()
+endforeach()
+
+########################################################################
+# done overrules! print the result
+########################################################################
+message(STATUS "Available machines: ${available_machines}")
+
+########################################################################
+# Create rules to run the volk generator
+########################################################################
+
+#dependencies are all python, xml, and header implementation files
+#NOTE: file(GLOB) is evaluated while CMake configures, so files added later
+#are only picked up once CMake runs again
+file(GLOB xml_files ${CMAKE_SOURCE_DIR}/gen/*.xml)
+file(GLOB py_files ${CMAKE_SOURCE_DIR}/gen/*.py)
+file(GLOB h_files ${CMAKE_SOURCE_DIR}/kernels/volk/*.h)
+
+#gen_template(tmpl output [extra args...])
+#Adds a build rule that runs volk_tmpl_utils.py to produce output from the
+#template file tmpl; any extra arguments are forwarded to the script.
+#The rule depends on the template and on every globbed xml, py and h file.
+#Each output is also appended to the volk_gen_sources list.
+macro(gen_template tmpl output)
+ list(APPEND volk_gen_sources ${output})
+ add_custom_command(
+ OUTPUT ${output}
+ DEPENDS ${xml_files} ${py_files} ${h_files} ${tmpl}
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${CMAKE_SOURCE_DIR}/gen/volk_tmpl_utils.py
+ --input ${tmpl} --output ${output} ${ARGN}
+ )
+endmacro(gen_template)
+
+#the generated public headers are written below this directory;
+#file(MAKE_DIRECTORY) replaces the deprecated make_directory() command
+file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/include/volk)
+
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk.c)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_typedefs.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_typedefs.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_cpu.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_cpu.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_cpu.c)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_config_fixed.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machines.tmpl.h ${CMAKE_BINARY_DIR}/lib/volk_machines.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machines.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_machines.c)
+
+#For every available machine: generate its source file, look up its
+#compile flags and record an upper-case LV_MACHINE_<name> definition.
+foreach(machine_name ${available_machines})
+ #generate machine source
+ set(machine_source ${CMAKE_CURRENT_BINARY_DIR}/volk_machine_${machine_name}.c)
+ gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machine_xxx.tmpl.c ${machine_source} ${machine_name})
+
+ #determine machine flags
+ #(the script output is stored in a variable named <machine_name>_flags)
+ execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "machine_flags" --machine "${machine_name}" --compiler "${COMPILER_NAME}"
+ OUTPUT_VARIABLE ${machine_name}_flags OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ #only the generated source of this machine is compiled with these flags
+ if(${machine_name}_flags)
+ set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${${machine_name}_flags}")
+ endif()
+
+ #add to available machine defs
+ string(TOUPPER LV_MACHINE_${machine_name} machine_def)
+ list(APPEND machine_defs ${machine_def})
+endforeach(machine_name)
+
+########################################################################
+# Set local include directories first
+########################################################################
+#the build tree include directory is listed ahead of the source tree ones
+include_directories(
+ ${CMAKE_BINARY_DIR}/include
+ ${CMAKE_SOURCE_DIR}/include
+ ${CMAKE_SOURCE_DIR}/kernels
+ ${CMAKE_CURRENT_BINARY_DIR}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+########################################################################
+# Handle orc support
+########################################################################
+if(NOT ORC_FOUND)
+    message(STATUS "Did not find liborc and orcc, disabling orc support...")
+else()
+    #make the orc headers and libraries usable by the volk library
+    include_directories(${ORC_INCLUDE_DIRS})
+    link_directories(${ORC_LIBRARY_DIRS})
+    list(APPEND volk_libraries ${ORC_LIBRARIES})
+
+    #every .orc file gets a rule that lets orcc turn it into a C source
+    #of the same base name, which is then added to the volk sources
+    file(GLOB orc_files ${CMAKE_SOURCE_DIR}/orc/*.orc)
+    foreach(orc_file ${orc_files})
+        get_filename_component(orc_file_name_we ${orc_file} NAME_WE)
+        set(orcc_gen ${CMAKE_CURRENT_BINARY_DIR}/${orc_file_name_we}.c)
+
+        add_custom_command(
+            OUTPUT ${orcc_gen}
+            COMMAND ${ORCC_EXECUTABLE} --include math.h --implementation -o ${orcc_gen} ${orc_file}
+            DEPENDS ${orc_file}
+        )
+        list(APPEND volk_sources ${orcc_gen})
+    endforeach()
+endif()
+
+########################################################################
+# Setup the volk sources list and library
+########################################################################
+if(NOT WIN32)
+    #the visibility flag is handed to the compiler through add_definitions
+    add_definitions(-fvisibility=hidden)
+endif()
+
+list(APPEND volk_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/volk_prefs.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/volk_rank_archs.c
+    ${volk_gen_sources}
+)
+
+#set the machine definitions where applicable
+#the paths are spelled exactly like the gen_template() outputs for volk.c
+#and volk_machines.c, so the property is attached to the generated files
+#even when the current binary dir is not ${CMAKE_BINARY_DIR}/lib
+set_source_files_properties(
+    ${CMAKE_BINARY_DIR}/lib/volk.c
+    ${CMAKE_BINARY_DIR}/lib/volk_machines.c
+PROPERTIES COMPILE_DEFINITIONS "${machine_defs}")
+
+if(MSVC)
+ #add compatibility includes for stdint types
+ include_directories(${CMAKE_SOURCE_DIR}/cmake/msvc)
+ add_definitions(-DHAVE_CONFIG_H)
+ #compile the sources as C++ due to the lack of complex.h under MSVC
+ set_source_files_properties(${volk_sources} PROPERTIES LANGUAGE CXX)
+endif()
+
+#create the volk runtime library
+#LIBVER and LIB_SUFFIX are not set in this file; they are expected from the
+#including scope
+add_library(volk SHARED ${volk_sources})
+target_link_libraries(volk ${volk_libraries})
+set_target_properties(volk PROPERTIES SOVERSION ${LIBVER})
+set_target_properties(volk PROPERTIES DEFINE_SYMBOL "volk_EXPORTS")
+
+#install the library; each kind of file is assigned to an install component
+install(TARGETS volk
+ LIBRARY DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_runtime" # .so file
+ ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_devel" # .lib file
+ RUNTIME DESTINATION bin COMPONENT "volk_runtime" # .dll file
+)
+
+########################################################################
+# Build the QA test application
+########################################################################
+if(Boost_FOUND)
+    include_directories(${Boost_INCLUDE_DIRS})
+    link_directories(${Boost_LIBRARY_DIRS})
+
+    #BOOST_TEST_DYN_LINK and BOOST_TEST_MAIN are defined for testqa.cc only
+    set_source_files_properties(
+        ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc
+        PROPERTIES COMPILE_DEFINITIONS "BOOST_TEST_DYN_LINK;BOOST_TEST_MAIN"
+    )
+
+    add_executable(test_all
+        ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/qa_utils.cc
+    )
+    target_link_libraries(test_all volk ${Boost_LIBRARIES})
+
+    #register the executable as a test named qa_volk_test_all
+    add_test(qa_volk_test_all test_all)
+endif()
+
diff --git a/volk/lib/gcc_x86_cpuid.h b/volk/lib/gcc_x86_cpuid.h
new file mode 100644
index 000000000..3c3f47b00
--- /dev/null
+++ b/volk/lib/gcc_x86_cpuid.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+/* Feature bit masks, grouped by the register named in the comment
+   above each group. */
+
+/* %ecx */
+#define bit_SSE3 (1 << 0)
+#define bit_PCLMUL (1 << 1)
+#define bit_SSSE3 (1 << 9)
+#define bit_FMA (1 << 12)
+#define bit_CMPXCHG16B (1 << 13)
+#define bit_SSE4_1 (1 << 19)
+#define bit_SSE4_2 (1 << 20)
+#define bit_MOVBE (1 << 22)
+#define bit_POPCNT (1 << 23)
+#define bit_AES (1 << 25)
+#define bit_XSAVE (1 << 26)
+#define bit_OSXSAVE (1 << 27)
+#define bit_AVX (1 << 28)
+#define bit_F16C (1 << 29)
+#define bit_RDRND (1 << 30)
+
+/* %edx */
+#define bit_CMPXCHG8B (1 << 8)
+#define bit_CMOV (1 << 15)
+#define bit_MMX (1 << 23)
+#define bit_FXSAVE (1 << 24)
+#define bit_SSE (1 << 25)
+#define bit_SSE2 (1 << 26)
+
+/* Extended Features */
+/* %ecx */
+#define bit_LAHF_LM (1 << 0)
+#define bit_ABM (1 << 5)
+#define bit_SSE4a (1 << 6)
+#define bit_XOP (1 << 11)
+#define bit_LWP (1 << 15)
+#define bit_FMA4 (1 << 16)
+#define bit_TBM (1 << 21)
+
+/* %edx */
+#define bit_MMXEXT (1 << 22)
+#define bit_LM (1 << 29)
+#define bit_3DNOWP (1 << 30)
+/* Unsigned literal: shifting a signed 1 into bit 31 overflows a 32-bit int. */
+#define bit_3DNOW (1u << 31)
+
+/* Extended Features (%eax == 7) */
+#define bit_FSGSBASE (1 << 0)
+#define bit_BMI (1 << 3)
+
+/* __cpuid (level, a, b, c, d) executes the cpuid instruction with LEVEL
+   in %eax and stores the resulting %eax, %ebx, %ecx and %edx in A, B, C
+   and D. __cpuid_count (level, count, a, b, c, d) additionally passes
+   COUNT in %ecx.
+   Three variants follow. The two used for i386 PIC code do not name %ebx
+   as an output; they exchange %ebx with the register chosen for B before
+   and after the instruction. */
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx may be the PIC register. */
+#if __GNUC__ >= 3
+#define __cpuid(level, a, b, c, d) \
+ __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchg{l}\t{%%}ebx, %1\n\t" \
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d) \
+ __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchg{l}\t{%%}ebx, %1\n\t" \
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "0" (level), "2" (count))
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+ nor alternatives in i386 code. */
+#define __cpuid(level, a, b, c, d) \
+ __asm__ ("xchgl\t%%ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchgl\t%%ebx, %1\n\t" \
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d) \
+ __asm__ ("xchgl\t%%ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchgl\t%%ebx, %1\n\t" \
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "0" (level), "2" (count))
+#endif
+#else
+/* Not i386 PIC code: %ebx is used directly as the output for B. */
+#define __cpuid(level, a, b, c, d) \
+ __asm__ ("cpuid\n\t" \
+ : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
+ : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d) \
+ __asm__ ("cpuid\n\t" \
+ : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
+ : "0" (level), "2" (count))
+#endif
+
+/* Return highest supported input value for cpuid instruction. ext can
+ be either 0x0 or 0x80000000 to return highest supported value for
+ basic or extended cpuid information. Function returns 0 if cpuid
+ is not supported or whatever cpuid returns in eax register. If sig
+ pointer is non-null, then first four bytes of the signature
+ (as found in ebx register) are returned in location pointed by sig. */
+
+static __inline unsigned int
+__get_cpuid_max (unsigned int __ext, unsigned int *__sig)
+{
+ unsigned int __eax, __ebx, __ecx, __edx;
+
+#ifndef __x86_64__
+ /* See if we can use cpuid. On AMD64 we always can. */
+ /* The sequence below saves the flags register, writes it back with bit
+    0x00200000 inverted, reads it again and finally restores the saved
+    flags. __ebx holds the flags before the attempt, __eax after it. */
+#if __GNUC__ >= 3
+ __asm__ ("pushf{l|d}\n\t"
+ "pushf{l|d}\n\t"
+ "pop{l}\t%0\n\t"
+ "mov{l}\t{%0, %1|%1, %0}\n\t"
+ "xor{l}\t{%2, %0|%0, %2}\n\t"
+ "push{l}\t%0\n\t"
+ "popf{l|d}\n\t"
+ "pushf{l|d}\n\t"
+ "pop{l}\t%0\n\t"
+ "popf{l|d}\n\t"
+ : "=&r" (__eax), "=&r" (__ebx)
+ : "i" (0x00200000));
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+ nor alternatives in i386 code. */
+ __asm__ ("pushfl\n\t"
+ "pushfl\n\t"
+ "popl\t%0\n\t"
+ "movl\t%0, %1\n\t"
+ "xorl\t%2, %0\n\t"
+ "pushl\t%0\n\t"
+ "popfl\n\t"
+ "pushfl\n\t"
+ "popl\t%0\n\t"
+ "popfl\n\t"
+ : "=&r" (__eax), "=&r" (__ebx)
+ : "i" (0x00200000));
+#endif
+
+ /* The bit could not be changed: report cpuid as unsupported. */
+ if (!((__eax ^ __ebx) & 0x00200000))
+ return 0;
+#endif
+
+ /* Host supports cpuid. Return highest supported cpuid input value. */
+ __cpuid (__ext, __eax, __ebx, __ecx, __edx);
+
+ if (__sig)
+ *__sig = __ebx;
+
+ return __eax;
+}
+
+/* Return cpuid data for requested cpuid level, as found in returned
+   eax, ebx, ecx and edx registers. The function checks if cpuid is
+   supported and returns 1 for valid cpuid information or 0 for
+   unsupported cpuid level. All pointers are required to be non-null. */
+
+static __inline int
+__get_cpuid (unsigned int __level,
+             unsigned int *__eax, unsigned int *__ebx,
+             unsigned int *__ecx, unsigned int *__edx)
+{
+  /* The top bit of the level selects the basic or the extended range. */
+  unsigned int __ext = __level & 0x80000000;
+  unsigned int __maxlevel = __get_cpuid_max (__ext, 0);
+
+  /* __get_cpuid_max returns 0 when cpuid is not supported. That case is
+     rejected explicitly, because "0 < __level" is false for level 0 and
+     would let the cpuid instruction run anyway. */
+  if (__maxlevel == 0 || __maxlevel < __level)
+    return 0;
+
+  __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx);
+  return 1;
+}
diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc
new file mode 100644
index 000000000..8da43b972
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.cc
@@ -0,0 +1,89 @@
+#include <volk/volk.h>
+#include <qa_16s_add_quad_aligned16.h>
+#include <volk/volk_16s_add_quad_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+// Without LV_HAVE_SSE2 nothing is compared; the test only prints a notice.
+void qa_16s_add_quad_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+// Runs the kernel ITERS times with the "generic" and with the "sse2"
+// implementation on the same random inputs, prints the time each run took
+// and asserts that the four outputs of both runs agree element by element.
+void qa_16s_add_quad_aligned16::t1() {
+  const int ITERS = 100000;
+  const int vlen = 3200;
+
+  volk_environment_init();
+
+  __VOLK_ATTR_ALIGNED(16) short input0[vlen];
+  __VOLK_ATTR_ALIGNED(16) short input1[vlen];
+  __VOLK_ATTR_ALIGNED(16) short input2[vlen];
+  __VOLK_ATTR_ALIGNED(16) short input3[vlen];
+  __VOLK_ATTR_ALIGNED(16) short input4[vlen];
+
+  // results of the "generic" run
+  __VOLK_ATTR_ALIGNED(16) short output0[vlen];
+  __VOLK_ATTR_ALIGNED(16) short output1[vlen];
+  __VOLK_ATTR_ALIGNED(16) short output2[vlen];
+  __VOLK_ATTR_ALIGNED(16) short output3[vlen];
+  // results of the "sse2" run
+  __VOLK_ATTR_ALIGNED(16) short output01[vlen];
+  __VOLK_ATTR_ALIGNED(16) short output11[vlen];
+  __VOLK_ATTR_ALIGNED(16) short output21[vlen];
+  __VOLK_ATTR_ALIGNED(16) short output31[vlen];
+
+  // Every input sample is the difference of two random values; the values
+  // are drawn in the order input0 .. input4 for each index.
+  short* const inputs[5] = {input0, input1, input2, input3, input4};
+  for(int i = 0; i < vlen; ++i) {
+    for(int k = 0; k < 5; ++k) {
+      const short plus = ((short) (rand() - (RAND_MAX/2))) >> 2;
+      const short minus = ((short) (rand() - (RAND_MAX/2))) >> 2;
+      inputs[k][i] = plus - minus;
+    }
+  }
+  printf("16s_add_quad_aligned\n");
+
+  const clock_t generic_begin = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_16s_add_quad_aligned16_manual(output0, output1, output2, output3, input0, input1, input2, input3, input4, vlen << 1 , "generic");
+  }
+  const clock_t generic_end = clock();
+  printf("generic_time: %f\n", (double)(generic_end-generic_begin)/(double)CLOCKS_PER_SEC);
+
+  const clock_t sse2_begin = clock();
+  for(int pass = 0; pass < ITERS; ++pass) {
+    volk_16s_add_quad_aligned16_manual(output01, output11, output21, output31, input0, input1, input2, input3, input4, vlen << 1 , "sse2");
+  }
+  const clock_t sse2_end = clock();
+  printf("sse2_time: %f\n", (double)(sse2_end-sse2_begin)/(double)CLOCKS_PER_SEC);
+
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output1[i], output11[i]);
+    CPPUNIT_ASSERT_EQUAL(output2[i], output21[i]);
+    CPPUNIT_ASSERT_EQUAL(output3[i], output31[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_add_quad_aligned16.h b/volk/lib/qa_16s_add_quad_aligned16.h
new file mode 100644
index 000000000..3c1ae978b
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
+#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case for the 16s_add_quad_aligned16 kernel.
+class qa_16s_add_quad_aligned16 : public CppUnit::TestCase {
+
+ // Declare the suite and register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // The single test; its body lives in qa_16s_add_quad_aligned16.cc.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
new file mode 100644
index 000000000..5a58569a1
--- /dev/null
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
@@ -0,0 +1,106 @@
+#include <volk/volk.h>
+#include <qa_16s_branch_4_state_8_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+// Without LV_HAVE_SSSE3 nothing is compared; the test only prints a notice.
+void qa_16s_branch_4_state_8_aligned16::t1() {
+  printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+// Times three runs of num_iters calls each: permute_and_scalar_add ("sse2"),
+// branch_4_state_8 ("ssse3") and branch_4_state_8 ("generic"). Afterwards
+// the permute_and_scalar_add result is printed next to the generic
+// branch_4_state_8 result, and both branch_4_state_8 results are asserted
+// to be equal to the permute_and_scalar_add result.
+void qa_16s_branch_4_state_8_aligned16::t1() {
+  const int num_iters = 1000000;
+  const int vlen = 32;
+
+  // Byte shuffle tables handed to branch_4_state_8. They are aligned with
+  // the same macro as every other buffer in this function.
+  static __VOLK_ATTR_ALIGNED(16) char permute0[16] = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03};
+  static __VOLK_ATTR_ALIGNED(16) char permute1[16] = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01};
+  static __VOLK_ATTR_ALIGNED(16) char permute2[16] = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f};
+  static __VOLK_ATTR_ALIGNED(16) char permute3[16] = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d};
+  static char* permuters[4] = {permute0, permute1, permute2, permute3};
+
+  // vlen shorts take 2 * vlen bytes
+  unsigned int num_bytes = vlen << 1;
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+
+  __VOLK_ATTR_ALIGNED(16) short target[vlen];   // permute_and_scalar_add, sse2
+  __VOLK_ATTR_ALIGNED(16) short target2[vlen];  // branch_4_state_8, ssse3
+  __VOLK_ATTR_ALIGNED(16) short target3[vlen];  // branch_4_state_8, generic
+
+  __VOLK_ATTR_ALIGNED(16) short src0[vlen];
+  __VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen] = {
+7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 };
+  __VOLK_ATTR_ALIGNED(16) short cntl0[vlen] = {
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
+  __VOLK_ATTR_ALIGNED(16) short cntl1[vlen] = {
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
+  __VOLK_ATTR_ALIGNED(16) short cntl2[vlen] = {
+    0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 };
+  __VOLK_ATTR_ALIGNED(16) short cntl3[vlen] = {
+    0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff };
+  __VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4};
+
+  for(int i = 0; i < vlen; ++i) {
+    src0[i] = i;
+  }
+
+  printf("16s_branch_4_state_8_aligned\n");
+
+  start = clock();
+  for(int i = 0; i < num_iters; ++i) {
+    volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
+  }
+  end = clock();
+
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+  printf("permute_and_scalar_add_time: %f\n", total);
+
+  start = clock();
+  for(int i = 0; i < num_iters; ++i) {
+    volk_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3");
+  }
+  end = clock();
+
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+  printf("branch_4_state_8_time, ssse3: %f\n", total);
+
+  start = clock();
+  for(int i = 0; i < num_iters; ++i) {
+    volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic");
+  }
+  end = clock();
+
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+  // This run timed branch_4_state_8, so it is reported under that name.
+  printf("branch_4_state_8_time, generic: %f\n", total);
+
+  for(int i = 0; i < vlen; ++i) {
+    printf("psa... %d, b4s8... %d\n", target[i], target3[i]);
+  }
+
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT(target[i] == target2[i]);
+    CPPUNIT_ASSERT(target[i] == target3[i]);
+  }
+}
+
+
+#endif
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.h b/volk/lib/qa_16s_branch_4_state_8_aligned16.h
new file mode 100644
index 000000000..41ab073e0
--- /dev/null
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
+#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case for the 16s_branch_4_state_8_aligned16 kernel.
+class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase {
+
+ // Declare the suite and register t1 as its only test.
+ CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // The single test; its body lives in qa_16s_branch_4_state_8_aligned16.cc.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
new file mode 100644
index 000000000..dadd2c580
--- /dev/null
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
@@ -0,0 +1,78 @@
+#include <volk/volk.h>
+#include <qa_16s_permute_and_scalar_add_aligned16.h>
+#include <volk/volk_16s_permute_and_scalar_add_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+// Without LV_HAVE_SSE2 nothing is compared; the test only prints a notice.
+void qa_16s_permute_and_scalar_add_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+// Runs the kernel 100000 times with the "generic" and with the "sse2"
+// implementation on identical inputs, prints the time each run took and
+// asserts that both results agree element by element.
+void qa_16s_permute_and_scalar_add_aligned16::t1() {
+  const int vlen = 64;
+  const int num_iters = 100000;
+
+  // vlen shorts take 2 * vlen bytes
+  unsigned int num_bytes = vlen << 1;
+
+  volk_environment_init();
+
+  __VOLK_ATTR_ALIGNED(16) short target[vlen];   // "generic" result
+  __VOLK_ATTR_ALIGNED(16) short target2[vlen];  // "sse2" result
+  __VOLK_ATTR_ALIGNED(16) short src0[vlen];
+  __VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen];
+  __VOLK_ATTR_ALIGNED(16) short cntl0[vlen];
+  __VOLK_ATTR_ALIGNED(16) short cntl1[vlen];
+  __VOLK_ATTR_ALIGNED(16) short cntl2[vlen];
+  __VOLK_ATTR_ALIGNED(16) short cntl3[vlen];
+  __VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4};
+
+  for(int i = 0; i < vlen; ++i) {
+    src0[i] = i;
+    permute_indexes[i] = (3 * i)%vlen;
+    // each control word is either 0xff or 0
+    cntl0[i] = 0xff;
+    cntl1[i] = (i%2) ? 0xff : 0;
+    cntl2[i] = ((i>>1)%2) ? 0xff : 0;
+    cntl3[i] = ((i%4) == 3) ? 0xff : 0;
+  }
+
+  printf("16s_permute_and_scalar_add_aligned\n");
+
+  const clock_t generic_begin = clock();
+  for(int pass = 0; pass < num_iters; ++pass) {
+    volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic");
+  }
+  const clock_t generic_end = clock();
+  printf("generic_time: %f\n", (double)(generic_end-generic_begin)/(double)CLOCKS_PER_SEC);
+
+  const clock_t sse2_begin = clock();
+  for(int pass = 0; pass < num_iters; ++pass) {
+    volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
+  }
+  const clock_t sse2_end = clock();
+  printf("sse2_time: %f\n", (double)(sse2_end-sse2_begin)/(double)CLOCKS_PER_SEC);
+
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT(target[i] == target2[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h
new file mode 100644
index 000000000..3643aeef6
--- /dev/null
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 16s_permute_and_scalar_add_aligned16 QA test.
+// The suite registers the single test method t1().
+class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc
new file mode 100644
index 000000000..2a5dec44a
--- /dev/null
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16s_quad_max_star_aligned16.h>
+#include <volk/volk_16s_quad_max_star_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse2
+
+// Two builds of t1(): without LV_HAVE_SSE2 it only reports that no test was
+// run; with it, the generic and sse2 implementations are compared.
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_quad_max_star_aligned16::t1() {
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+// Feeds four random short vectors to volk_16s_quad_max_star_aligned16 via its
+// "generic" and "sse2" implementations, prints every element of both results
+// together with the inputs, and asserts the results are equal.
+void qa_16s_quad_max_star_aligned16::t1() {
+ const int vlen = 34;
+
+ __VOLK_ATTR_ALIGNED(16) short input0[vlen];
+ __VOLK_ATTR_ALIGNED(16) short input1[vlen];
+ __VOLK_ATTR_ALIGNED(16) short input2[vlen];
+ __VOLK_ATTR_ALIGNED(16) short input3[vlen];
+
+ // output0 holds the generic result, output1 the sse2 result
+ __VOLK_ATTR_ALIGNED(16) short output0[vlen];
+ __VOLK_ATTR_ALIGNED(16) short output1[vlen];
+
+ // Each input element is the difference of two random values that were
+ // themselves narrowed to short.
+ for(int i = 0; i < vlen; ++i) {
+ short plus0 = (short) (rand() - (RAND_MAX/2));
+ short plus1 = (short) (rand() - (RAND_MAX/2));
+ short plus2 = (short) (rand() - (RAND_MAX/2));
+ short plus3 = (short) (rand() - (RAND_MAX/2));
+
+ short minus0 = (short) (rand() - (RAND_MAX/2));
+ short minus1 = (short) (rand() - (RAND_MAX/2));
+ short minus2 = (short) (rand() - (RAND_MAX/2));
+ short minus3 = (short) (rand() - (RAND_MAX/2));
+
+ input0[i] = plus0 - minus0;
+ input1[i] = plus1 - minus1;
+ input2[i] = plus2 - minus2;
+ input3[i] = plus3 - minus3;
+ }
+
+ // The length argument is 2*vlen, i.e. the byte size of vlen shorts.
+ volk_16s_quad_max_star_aligned16_manual(output0, input0, input1, input2, input3, 2*vlen, "generic");
+
+ volk_16s_quad_max_star_aligned16_manual(output1, input0, input1, input2, input3, 2*vlen, "sse2");
+
+ printf("16s_quad_max_star_aligned\n");
+ for(int i = 0; i < vlen; ++i) {
+ printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]);
+ }
+
+ for(int i = 0; i < vlen; ++i) {
+
+ CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.h b/volk/lib/qa_16s_quad_max_star_aligned16.h
new file mode 100644
index 000000000..51e77081a
--- /dev/null
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
+#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 16s_quad_max_star_aligned16 QA test.
+// The suite registers the single test method t1().
+class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc
new file mode 100644
index 000000000..4e792ec6c
--- /dev/null
+++ b/volk/lib/qa_32f_fm_detect_aligned16.cc
@@ -0,0 +1,61 @@
+#include <volk/volk.h>
+#include <qa_32f_fm_detect_aligned16.h>
+#include <volk/volk_32f_fm_detect_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse
+
+// Two builds of t1(): without LV_HAVE_SSE it only reports that no test was
+// run; with it, the generic and sse implementations are timed and compared.
+#ifndef LV_HAVE_SSE
+
+void qa_32f_fm_detect_aligned16::t1() {
+ printf("sse not available... no test performed\n");
+}
+
+#else
+
+// Calls volk_32f_fm_detect_aligned16 ITERS times per implementation ("generic"
+// then "sse") on the same random input, prints the elapsed clock() time of
+// each, and asserts the outputs agree within a relative tolerance of 1e-4.
+void qa_32f_fm_detect_aligned16::t1() {
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 10000;
+ __VOLK_ATTR_ALIGNED(16) float input0[vlen];
+
+ // output0 holds the generic result, output01 the sse result
+ __VOLK_ATTR_ALIGNED(16) float output0[vlen];
+ __VOLK_ATTR_ALIGNED(16) float output01[vlen];
+
+ // random input scaled to roughly [-1, 1]
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+ }
+ printf("32f_fm_detect_aligned\n");
+
+ start = clock();
+ // save is passed by address on every call and is reset to the same
+ // starting value before the second implementation runs
+ float save = 0.1;
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_fm_detect_aligned16_manual(output0, input0, 1.0, &save, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ save = 0.1;
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_fm_detect_aligned16_manual(output01, input0, 1.0, &save, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+ // Debug aid: both prints are disabled, so this loop does nothing.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ // The allowed difference scales with the magnitude of the generic result.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.h b/volk/lib/qa_32f_fm_detect_aligned16.h
new file mode 100644
index 000000000..a2680c524
--- /dev/null
+++ b/volk/lib/qa_32f_fm_detect_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
+#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_fm_detect_aligned16 QA test.
+// The suite registers the single test method t1().
+class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc
new file mode 100644
index 000000000..2df206726
--- /dev/null
+++ b/volk/lib/qa_32f_index_max_aligned16.cc
@@ -0,0 +1,103 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_index_max_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+// Tolerance constant; not referenced anywhere else in this file.
+#define ERR_DELTA (1e-4)
+// Number of kernel calls per timed loop.
+#define NUM_ITERS 1000000
+// Number of floats in the test vector.
+#define VEC_LEN 3097
+// Draws one pseudo-random value from rand(), rescaled from [0, 1] to [-1, 1].
+static float uniform() {
+  const float unit = (float) rand() / RAND_MAX;
+  return 2.0 * (unit - 0.5);
+}
+
+// Fills buf[0..n) with uniform() samples scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned int idx = 0; idx < n; ++idx)
+    buf[idx] = uniform () * 32767;
+}
+
+
+// Two builds of t1(): without LV_HAVE_SSE it only reports that no test was
+// run; with it, three code paths of the kernel are timed and compared.
+#ifndef LV_HAVE_SSE
+
+void qa_32f_index_max_aligned16::t1(){
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+
+// Runs volk_32f_index_max_aligned16 NUM_ITERS times through the "generic"
+// implementation, the implementation named "sse2", and the runtime-dispatched
+// entry point; prints the elapsed clock() time of each and asserts that all
+// three produce the same single result.
+void qa_32f_index_max_aligned16::t1(){
+
+  const int vlen = VEC_LEN;
+
+
+  volk_runtime_init();
+
+  volk_environment_init();
+
+  // Each code path writes its result into its own local.
+  unsigned int i_target_sse4_1;
+  unsigned int* target_sse4_1 = &i_target_sse4_1;
+  unsigned int i_target_sse;
+  unsigned int* target_sse = &i_target_sse;
+  unsigned int i_target_generic;
+  unsigned int* target_generic = &i_target_generic;
+
+  // The allocation result used to be stored and ignored; a failed allocation
+  // would have left src0 unusable for everything below.
+  float* src0 = NULL;
+  const int ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float));
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+
+  random_floats((float*)src0, vlen);
+
+  printf("32f_index_max_aligned16\n");
+
+  clock_t start, end;
+  double total;
+
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  // NOTE(review): the implementation requested here is "sse2" while the
+  // guard and the printed label say "sse" -- confirm the intended name.
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2");
+  }
+
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse time: %f\n", total);
+
+  // Runtime dispatch; reported as sse4.1 in the output.
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    get_volk_runtime()->volk_32f_index_max_aligned16(target_sse4_1, src0, vlen);
+  }
+
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.1 time: %f\n", total);
+
+
+  printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]);
+  CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]);
+  CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]);
+
+  free(src0);
+}
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32f_index_max_aligned16.h b/volk/lib/qa_32f_index_max_aligned16.h
new file mode 100644
index 000000000..8cadffa47
--- /dev/null
+++ b/volk/lib/qa_32f_index_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32f_index_max_aligned16 QA test.
+// The suite registers the single test method t1().
+class qa_32f_index_max_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc
new file mode 100644
index 000000000..3859bcb52
--- /dev/null
+++ b/volk/lib/qa_32fc_index_max_aligned16.cc
@@ -0,0 +1,89 @@
+#include <volk/volk.h>
+#include <qa_32fc_index_max_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+// Tolerance constant; not referenced anywhere else in this file.
+#define ERR_DELTA (1e-4)
+// Number of kernel calls per timed loop.
+#define NUM_ITERS 1000000
+// Number of complex samples in the test vector.
+#define VEC_LEN 3096
+// Draws one pseudo-random value from rand(), rescaled from [0, 1] to [-1, 1].
+static float uniform() {
+  const float unit = (float) rand() / RAND_MAX;
+  return 2.0 * (unit - 0.5);
+}
+
+// Fills buf[0..n) with uniform() samples scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned int idx = 0; idx < n; ++idx)
+    buf[idx] = uniform () * 32767;
+}
+
+
+// Two builds of t1(): without LV_HAVE_SSE3 it only reports that no test was
+// run; with it, the generic and sse3 implementations are timed and compared.
+#ifndef LV_HAVE_SSE3
+
+void qa_32fc_index_max_aligned16::t1(){
+  printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+
+// Runs volk_32fc_index_max_aligned16 NUM_ITERS times through its "generic"
+// and "sse3" implementations on the same random complex vector, prints the
+// elapsed clock() time of each, and asserts the two results differ by at most
+// 1.1.
+void qa_32fc_index_max_aligned16::t1(){
+
+  const int vlen = VEC_LEN;
+
+  volk_environment_init();
+
+  // Each implementation writes its result into its own local.
+  unsigned int i_target;
+  unsigned int* target = &i_target;
+  unsigned int i_target_generic;
+  unsigned int* target_generic = &i_target_generic;
+
+  // vlen complex floats at 8 bytes each.  The allocation result used to be
+  // stored and ignored; a failed allocation would have left src0 unusable.
+  std::complex<float>* src0 = NULL;
+  const int ret = posix_memalign((void**)&src0, 16, vlen << 3);
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+
+  // fill real and imaginary parts as 2*vlen plain floats
+  random_floats((float*)src0, vlen * 2);
+
+  printf("32fc_index_max_aligned16\n");
+
+  clock_t start, end;
+  double total;
+
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3");
+  }
+
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3 time: %f\n", total);
+
+
+
+
+  printf("generic: %u, sse3: %u\n", target_generic[0], target[0]);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1);
+
+
+
+  free(src0);
+}
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32fc_index_max_aligned16.h b/volk/lib/qa_32fc_index_max_aligned16.h
new file mode 100644
index 000000000..0990bcb1f
--- /dev/null
+++ b/volk/lib/qa_32fc_index_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32fc_index_max_aligned16 QA test.
+// The suite registers the single test method t1().
+class qa_32fc_index_max_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
new file mode 100644
index 000000000..daca31d9c
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
@@ -0,0 +1,64 @@
+#include <volk/volk.h>
+#include <qa_32fc_power_spectral_density_32f_aligned16.h>
+#include <volk/volk_32fc_power_spectral_density_32f_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse3
+
+// Two builds of t1(): without LV_HAVE_SSE3 it only reports that no test was
+// run; with it, the generic and sse3 implementations are timed and compared.
+#ifndef LV_HAVE_SSE3
+
+void qa_32fc_power_spectral_density_32f_aligned16::t1() {
+ printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+// Calls volk_32fc_power_spectral_density_32f_aligned16 ITERS times per
+// implementation ("generic" then "sse3") on the same random complex input,
+// prints the elapsed clock() time of each, and asserts the outputs agree
+// within a relative tolerance of 1e-4.
+void qa_32fc_power_spectral_density_32f_aligned16::t1() {
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 10000;
+ __VOLK_ATTR_ALIGNED(16) std::complex<float> input0[vlen];
+
+ __VOLK_ATTR_ALIGNED(16) float output_generic[vlen];
+ __VOLK_ATTR_ALIGNED(16) float output_sse3[vlen];
+
+ // extra arguments handed unchanged to both implementations
+ const float scalar = vlen;
+ const float rbw = 1.7;
+
+ // fill the complex input as 2*vlen plain floats scaled to roughly [-1, 1]
+ float* inputLoad = (float*)input0;
+ for(int i = 0; i < 2*vlen; ++i) {
+ inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+ }
+ printf("32fc_power_spectral_density_32f_aligned\n");
+
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse3_time: %f\n", total);
+
+ // Debug aid: both prints are disabled, so this loop does nothing.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ // The allowed difference scales with the magnitude of the generic result.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4));
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h
new file mode 100644
index 000000000..26f430bec
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32fc_power_spectral_density_32f_aligned16 QA test.
+// The suite registers the single test method t1().
+class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc
new file mode 100644
index 000000000..b825c20e4
--- /dev/null
+++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc
@@ -0,0 +1,138 @@
+#include <volk/volk.h>
+#include <qa_32fc_x2_conjugate_dot_prod_32fc_u.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+
+// Asserts that two complex values agree component by component; each
+// component may differ by at most |expected component| * delta.
+// The body is wrapped in do/while(0) so the macro expands to one statement
+// (safe after an unbraced if/else), and delta is parenthesised so that an
+// expression argument keeps its meaning.
+#define assertcomplexEqual(expected, actual, delta) \
+ do { \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * (delta)); \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected)) * (delta)); \
+ } while (0)
+
+// Relative tolerance passed to assertcomplexEqual.
+#define ERR_DELTA (1e-4)
+
+//test for sse
+
+// The 64-bit and 32-bit SSE builds of this test were identical except for the
+// implementation name, so they now share one body.
+#if LV_HAVE_SSE && (LV_HAVE_64 || LV_HAVE_32)
+
+// Implementation name to compare against "generic".  As before, the 64-bit
+// name wins when both word-size macros are set.
+#if LV_HAVE_64
+#define QA_CONJUGATE_DOT_PROD_SSE_ARCH "sse"
+#else
+#define QA_CONJUGATE_DOT_PROD_SSE_ARCH "sse_32"
+#endif
+
+// Draws one pseudo-random value from rand(), rescaled from [0, 1] to [-1, 1].
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+// Fills buf[0..n) with uniform() samples scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned i = 0; i < n; i++)
+    buf[i] = uniform () * 32767;
+}
+
+
+// Runs volk_32fc_x2_conjugate_dot_prod_32fc_u once through the "generic"
+// implementation and once through the SSE implementation selected above, on
+// the same random input and taps, prints both results, and asserts they agree
+// within ERR_DELTA relative error per component.
+void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
+  const int vlen = 789743;
+
+  volk_environment_init();
+  int ret;
+
+  std::complex<float>* input = NULL;
+  std::complex<float>* taps = NULL;
+
+  std::complex<float>* result_generic = NULL;
+  std::complex<float>* result = NULL;
+
+  // Every allocation is checked: the return codes used to be stored and
+  // ignored, so a failure would have been followed by writes through an
+  // unusable pointer.  vlen << 3 is vlen complex floats at 8 bytes each.
+  ret = posix_memalign((void**)&input, 16, vlen << 3);
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+  ret = posix_memalign((void**)&taps, 16, vlen << 3);
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+  ret = posix_memalign((void**)&result_generic, 16, 8);
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+  ret = posix_memalign((void**)&result, 16, 8);
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+
+
+  result_generic[0] = std::complex<float>(0,0);
+  result[0] = std::complex<float>(0,0);
+
+  // fill real and imaginary parts as 2*vlen plain floats
+  random_floats((float*)input, vlen * 2);
+  random_floats((float*)taps, vlen * 2);
+
+
+
+  volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic");
+
+
+  volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, QA_CONJUGATE_DOT_PROD_SSE_ARCH);
+
+  printf("32fc_x2_conjugate_dot_prod_32fc_u\n");
+  printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0]));
+
+  assertcomplexEqual(result_generic[0], result[0], ERR_DELTA);
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result);
+
+}
+
+
+#else
+
+void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h
new file mode 100644
index 000000000..f07402403
--- /dev/null
+++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H
+#define INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32fc_x2_conjugate_dot_prod_32fc_u QA test.
+// The suite registers the single test method t1().
+class qa_32fc_x2_conjugate_dot_prod_32fc_u : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_x2_conjugate_dot_prod_32fc_u);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H */
diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc
new file mode 100644
index 000000000..5559d933d
--- /dev/null
+++ b/volk/lib/qa_32u_popcnt_aligned16.cc
@@ -0,0 +1,62 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32u_popcnt_aligned16.h>
+#include <volk/volk_32u_popcnt_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse4.2
+
+// Two builds of t1(): without LV_HAVE_SSE4_2 it only reports that no test was
+// run; with it, the generic implementation is compared with runtime dispatch.
+#ifndef LV_HAVE_SSE4_2
+
+void qa_32u_popcnt_aligned16::t1() {
+ printf("sse4.2 not available... no test performed\n");
+}
+
+#else
+
+// Calls volk_32u_popcnt_aligned16 ITERS times on one random 32-bit word, first
+// through the "generic" implementation and then through the runtime-dispatched
+// entry point (reported as sse4.2).  The per-call results are summed for each
+// path and the two sums are asserted equal.
+void qa_32u_popcnt_aligned16::t1() {
+
+
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+
+ const int ITERS = 10000000;
+ __VOLK_ATTR_ALIGNED(16) uint32_t input0;
+
+ // output0 accumulates the generic results, output01 the dispatched results
+ __VOLK_ATTR_ALIGNED(16) uint32_t output0;
+ __VOLK_ATTR_ALIGNED(16) uint32_t output01;
+
+ input0 = ((uint32_t) (rand() - (RAND_MAX/2)));
+ output0 = 0;
+ output01 = 0;
+
+ printf("32u_popcnt_aligned\n");
+
+ start = clock();
+ uint32_t ret = 0;
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32u_popcnt_aligned16_manual(&ret, input0, "generic");
+ output0 += ret;
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ ret = 0;
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_32u_popcnt_aligned16(&ret, input0);
+ output01 += ret;
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4.2_time: %f\n", total);
+
+
+ CPPUNIT_ASSERT_EQUAL(output0, output01);
+}
+
+#endif
diff --git a/volk/lib/qa_32u_popcnt_aligned16.h b/volk/lib/qa_32u_popcnt_aligned16.h
new file mode 100644
index 000000000..fa1dc1041
--- /dev/null
+++ b/volk/lib/qa_32u_popcnt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32U_POPCNT_ALIGNED16_H
+#define INCLUDED_QA_32U_POPCNT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 32u_popcnt_aligned16 QA test.
+// The suite registers the single test method t1().
+class qa_32u_popcnt_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32u_popcnt_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32U_POPCNT_ALIGNED16_H */
diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc
new file mode 100644
index 000000000..391601f22
--- /dev/null
+++ b/volk/lib/qa_64u_popcnt_aligned16.cc
@@ -0,0 +1,62 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_64u_popcnt_aligned16.h>
+#include <volk/volk_64u_popcnt_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse4.2
+
+// Two builds of t1(): without LV_HAVE_SSE4_2 it only reports that no test was
+// run; with it, the generic implementation is compared with runtime dispatch.
+#ifndef LV_HAVE_SSE4_2
+
+void qa_64u_popcnt_aligned16::t1() {
+ printf("sse4.2 not available... no test performed\n");
+}
+
+#else
+
+// Calls volk_64u_popcnt_aligned16 ITERS times on one random 64-bit word, first
+// through the "generic" implementation and then through the runtime-dispatched
+// entry point (reported as sse4.2).  The per-call results are summed for each
+// path and the two sums are asserted equal.
+void qa_64u_popcnt_aligned16::t1() {
+
+
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+
+ const int ITERS = 10000000;
+ __VOLK_ATTR_ALIGNED(16) uint64_t input0;
+
+ // output0 accumulates the generic results, output01 the dispatched results
+ __VOLK_ATTR_ALIGNED(16) uint64_t output0;
+ __VOLK_ATTR_ALIGNED(16) uint64_t output01;
+
+ input0 = ((uint64_t) (rand() - (RAND_MAX/2)));
+ output0 = 0;
+ output01 = 0;
+
+ printf("64u_popcnt_aligned\n");
+
+ start = clock();
+ uint64_t ret = 0;
+ for(int count = 0; count < ITERS; ++count) {
+ volk_64u_popcnt_aligned16_manual(&ret, input0, "generic");
+ output0 += ret;
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ ret = 0;
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_64u_popcnt_aligned16(&ret, input0);
+ output01 += ret;
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4.2_time: %f\n", total);
+
+
+ CPPUNIT_ASSERT_EQUAL(output0, output01);
+}
+
+#endif
diff --git a/volk/lib/qa_64u_popcnt_aligned16.h b/volk/lib/qa_64u_popcnt_aligned16.h
new file mode 100644
index 000000000..217822d6e
--- /dev/null
+++ b/volk/lib/qa_64u_popcnt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64U_POPCNT_ALIGNED16_H
+#define INCLUDED_QA_64U_POPCNT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit fixture for the 64u_popcnt_aligned16 QA test.
+// The suite registers the single test method t1().
+class qa_64u_popcnt_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_64u_popcnt_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_64U_POPCNT_ALIGNED16_H */
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
new file mode 100644
index 000000000..e526eb2d0
--- /dev/null
+++ b/volk/lib/qa_utils.cc
@@ -0,0 +1,477 @@
+#include "qa_utils.h"
+#include <cstring>
+#include <boost/foreach.hpp>
+#include <boost/assign/list_of.hpp>
+#include <boost/tokenizer.hpp>
+#include <iostream>
+#include <vector>
+#include <list>
+#include <ctime>
+#include <cmath>
+#include <limits>
+#include <boost/lexical_cast.hpp>
+#include <volk/volk.h>
+#include <volk/volk_cpu.h>
+#include <volk/volk_common.h>
+#include <boost/typeof/typeof.hpp>
+#include <boost/type_traits.hpp>
+#include <stdio.h>
+
+// Draws one pseudo-random value from rand(), rescaled from [0, 1] to [-1, 1].
+float uniform() {
+  const float unit = (float) rand() / RAND_MAX;
+  return 2.0 * (unit - 0.5);
+}
+
+// Fills buf[0..n) with uniform() samples converted to the element type t.
+template <class t>
+void random_floats (t *buf, unsigned n)
+{
+  t *const stop = buf + n;
+  for (t *cursor = buf; cursor != stop; ++cursor)
+    *cursor = uniform ();
+}
+
+// Fills the buffer at data with n random elements of the described type.
+//   data - destination buffer, written as an array of the element type
+//          selected by type.is_float / type.is_signed / type.size
+//   type - element description; type.size is in bytes
+//   n    - element count; doubled for complex types, which are written as
+//          twice as many plain values
+// Throws a string literal when an integer type has a size other than
+// 1, 2, 4 or 8 bytes.
+void load_random_data(void *data, volk_type_t type, unsigned int n) {
+ if(type.is_complex) n *= 2;
+ if(type.is_float) {
+ if(type.size == 8) random_floats<double>((double *)data, n);
+ else random_floats<float>((float *)data, n);
+ } else {
+ // NOTE(review): 2 << bits is 2^(bits+1), or 2^bits after halving for
+ // signed types -- both exceed what the target type can hold, and for
+ // type.size == 8 the shift count is 64.  Confirm the intended range.
+ float int_max = float(uint64_t(2) << (type.size*8));
+ if(type.is_signed) int_max /= 2.0;
+ for(unsigned int i=0; i<n; i++) {
+ // random value in roughly [-int_max, int_max]; it can be negative even
+ // when the destination type is unsigned
+ float scaled_rand = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * int_max;
+ //man i really don't know how to do this in a more clever way, you have to cast down at some point
+ switch(type.size) {
+ case 8:
+ if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand;
+ else ((uint64_t *)data)[i] = (uint64_t) scaled_rand;
+ break;
+ case 4:
+ if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand;
+ else ((uint32_t *)data)[i] = (uint32_t) scaled_rand;
+ break;
+ case 2:
+ if(type.is_signed) ((int16_t *)data)[i] = (int16_t) scaled_rand;
+ else ((uint16_t *)data)[i] = (uint16_t) scaled_rand;
+ break;
+ case 1:
+ if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand;
+ else ((uint8_t *)data)[i] = (uint8_t) scaled_rand;
+ break;
+ default:
+ throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here
+ }
+ }
+ }
+}
+
+// Returns the implementation names listed in desc, in order.  No filtering
+// by what the current machine supports is applied.
+static std::vector<std::string> get_arch_list(volk_func_desc_t desc) {
+  std::vector<std::string> names;
+
+  size_t idx = 0;
+  while(idx < desc.n_impls) {
+    names.push_back(std::string(desc.impl_names[idx]));
+    ++idx;
+  }
+
+  return names;
+}
+
+// Parses one type token of a kernel name into a volk_type_t.
+// Token layout: an optional leading 's' (scalar), a bit width, then any of
+// the qualifier letters f (float), i (signed), c (complex), u (unsigned).
+//   name - the token; type.str keeps it as passed in
+// Returns the filled-in description, with type.size in bytes.
+// Throws std::string when the token is too short, has no digits, or carries
+// an unknown qualifier letter; boost::lexical_cast throws when the width
+// part is not a number.  The width itself is validated only by assert.
+volk_type_t volk_type_from_string(std::string name) {
+  volk_type_t type;
+  type.is_float = false;
+  type.is_scalar = false;
+  type.is_complex = false;
+  type.is_signed = false;
+  type.size = 0;
+  type.str = name;
+
+  if(name.size() < 2) throw std::string("name too short to be a datatype");
+
+  //is it a scalar?
+  if(name[0] == 's') {
+    type.is_scalar = true;
+    name = name.substr(1, name.size()-1);
+  }
+
+  //get the data size
+  size_t last_size_pos = name.find_last_of("0123456789");
+  // find_last_of reports "not found" as npos; the old "< 0" test on an
+  // unsigned value could never be true
+  if(last_size_pos == std::string::npos) throw std::string("no size spec in type ").append(name);
+  //will throw if malformed
+  int size = boost::lexical_cast<int>(name.substr(0, last_size_pos+1));
+
+  assert(((size % 8) == 0) && (size <= 64) && (size != 0));
+  type.size = size/8; //in bytes
+
+  for(size_t i=last_size_pos+1; i < name.size(); i++) {
+    switch (name[i]) {
+    case 'f':
+      type.is_float = true;
+      break;
+    case 'i':
+      type.is_signed = true;
+      break;
+    case 'c':
+      type.is_complex = true;
+      break;
+    case 'u':
+      type.is_signed = false;
+      break;
+    default:
+      // a bare "throw;" with no exception in flight terminates the program
+      // instead of reporting the bad token to the caller
+      throw std::string("unknown type qualifier in type ").append(name);
+    }
+  }
+
+  return type;
+}
+
+// Derives the input and output type lists of a kernel from its name.
+//   inputsig  - receives the types found before the function-name tokens
+//   outputsig - receives the types found after them
+//   name      - kernel name, split on '_'; the first token must be "volk"
+// Each token is first tried as a type; tokens that fail to parse are treated
+// as a multiplier, as part of the function name, or as the trailing alignment
+// token.  At least one input type must result (checked by assert).
+static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
+ std::vector<volk_type_t> &outputsig,
+ std::string name) {
+ boost::char_separator<char> sep("_");
+ boost::tokenizer<boost::char_separator<char> > tok(name, sep);
+ std::vector<std::string> toked;
+ tok.assign(name);
+ toked.assign(tok.begin(), tok.end());
+
+ assert(toked[0] == "volk");
+ toked.erase(toked.begin());
+
+ //ok. we're assuming a string in the form
+ //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
+
+ enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT;
+ // fn_name only ever receives the first name token and is not read again
+ // in this function
+ std::string fn_name;
+ volk_type_t type;
+ BOOST_FOREACH(std::string token, toked) {
+ try {
+ type = volk_type_from_string(token);
+ if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
+
+ if(side == SIDE_INPUT) inputsig.push_back(type);
+ else outputsig.push_back(type);
+ } catch (...){
+ // not a type token: decide what else it can be
+ if(token[0] == 'x') { //it's a multiplier
+ if(side == SIDE_INPUT) assert(inputsig.size() > 0);
+ else assert(outputsig.size() > 0);
+ int multiplier = boost::lexical_cast<int>(token.substr(1, token.size()-1)); //will throw if invalid
+ // the type is already listed once, so add multiplier-1 more copies
+ for(int i=1; i<multiplier; i++) {
+ if(side == SIDE_INPUT) inputsig.push_back(inputsig.back());
+ else outputsig.push_back(outputsig.back());
+ }
+ }
+ else if(side == SIDE_INPUT) { //it's the function name, at least it better be
+ side = SIDE_NAME;
+ fn_name.append("_");
+ fn_name.append(token);
+ }
+ else if(side == SIDE_OUTPUT) {
+ // rethrows the parse failure caught above
+ if(token != toked.back()) throw; //the last token in the name is the alignment
+ }
+ // further non-type tokens while side == SIDE_NAME are ignored
+ }
+ }
+ //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input!
+ assert(inputsig.size() != 0);
+
+}
+
+// Family of helpers that call a kernel entry point `iter` times with the
+// buffers in `buffs`, an optional scalar, the length `vlen`, and the
+// implementation name `arch`.  The variants differ only in how many buffers
+// are passed and in the scalar type.
+
+inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass)
+    func(buffs[0], vlen, arch.c_str());
+}
+
+inline void run_cast_test2(volk_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass)
+    func(buffs[0], buffs[1], vlen, arch.c_str());
+}
+
+inline void run_cast_test3(volk_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass)
+    func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
+}
+
+inline void run_cast_test4(volk_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass)
+    func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
+}
+
+inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass)
+    func(buffs[0], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass)
+    func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass)
+    func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass)
+    func(buffs[0], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass)
+    func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+  for(unsigned int pass = 0; pass < iter; ++pass)
+    func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+}
+
+// Compares two floating-point arrays by relative error.
+//   in1  - reference values
+//   in2  - values under test
+//   vlen - number of elements to compare
+//   tol  - largest accepted |in1[i] - in2[i]| / |in1[i]|
+// Returns true when at least one element is out of tolerance (i.e. true
+// means FAIL); the first ten offending elements are printed.
+// Reference values with magnitude below 1e-30 are skipped.
+template <class t>
+bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
+  bool fail = false;
+  int print_max_errs = 10;
+  for(unsigned int i=0; i<vlen; i++) {
+    const t expected = in1[i];
+    const t actual = in2[i];
+    // Work on magnitudes: the old signed tests skipped every negative
+    // reference value and could never flag one, so negative results went
+    // unchecked.
+    if(std::fabs(expected) < 1e-30) continue; //this is a hack: below around here we'll start to get roundoff errors due to limited precision
+    if(std::fabs(expected - actual)/std::fabs(expected) > tol) {
+      fail=true;
+      if(print_max_errs-- > 0) {
+        std::cout << "offset " << i << " in1: " << expected << " in2: " << actual << std::endl;
+      }
+    }
+  }
+
+  return fail;
+}
+
+// Compares two integer arrays by absolute difference.
+//   in1, in2 - arrays to compare
+//   vlen     - number of elements to compare
+//   tol      - largest accepted |in1[i] - in2[i]|
+// Returns true when at least one element is out of tolerance (i.e. true
+// means FAIL); the first ten offending elements are printed.
+template <class t>
+bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) {
+  bool fail = false;
+  int print_max_errs = 10;
+  for(unsigned int i=0; i<vlen; i++) {
+    const t a = in1[i];
+    const t b = in2[i];
+    // Subtract smaller from larger in 64-bit unsigned arithmetic.  Narrowing
+    // both values to int first truncated 64-bit elements and could overflow
+    // for 32-bit ones, hiding real differences.
+    const uint64_t diff = (a > b) ? uint64_t(a) - uint64_t(b)
+                                  : uint64_t(b) - uint64_t(a);
+    if(diff > tol) {
+      fail=true;
+      if(print_max_errs-- > 0) {
+        std::cout << "offset " << i << " in1: " << static_cast<int>(a) << " in2: " << static_cast<int>(b) << std::endl;
+      }
+    }
+  }
+
+  return fail;
+}
+
+// Hands out zero-filled, aligned scratch buffers that live as long as the
+// pool object does.
+class volk_qa_aligned_mem_pool{
+public:
+  // Returns a pointer to at least `size` usable bytes, aligned to
+  // volk_get_alignment().  The backing storage is over-allocated by
+  // alignment-1 bytes so the start can be rounded up.
+  void *get_new(size_t size){
+    const size_t align = volk_get_alignment();
+    const size_t pad = align - 1;
+    _mems.push_back(std::vector<char>(size + pad, 0));
+    const size_t base = size_t(&_mems.back().front());
+    return (void *)((base + pad) & ~pad);
+  }
+private:
+  // one entry per buffer handed out
+  std::list<std::vector<char> > _mems;
+};
+
+// Run every available implementation ("arch") of one VOLK kernel on the same
+// random input, time it, and compare its buffers against the "generic" run.
+//
+//   desc               - kernel descriptor; supplies the arch list and impl_alignment
+//   manual_func        - the kernel's manual entry point, cast here to the volk_fn_* type
+//                        that matches the parsed signature
+//   name               - kernel name; parsed to derive the input/output signature
+//   tol                - tolerance: relative for float buffers, absolute for integer buffers
+//   scalar             - scalar argument for kernels that take one (real part only for s32f)
+//   vlen               - number of items per buffer
+//   iter               - number of calls made to each implementation while it is timed
+//   best_arch_vector   - optional; receives "<name> <best aligned arch> <best unaligned arch>"
+//   puppet_master_name - reported in best_arch_vector instead of `name` unless it is "NULL"
+//
+// Returns true if any implementation disagrees with generic (FAILURE) and false
+// otherwise, including when fewer than two implementations are available.
+// Throws a const char* when there is no handler for the kernel's signature.
+bool run_volk_tests(volk_func_desc_t desc,
+                    void (*manual_func)(),
+                    std::string name,
+                    float tol,
+                    lv_32fc_t scalar,
+                    int vlen,
+                    int iter,
+                    std::vector<std::string> *best_arch_vector = 0,
+                    std::string puppet_master_name = "NULL"
+                    ) {
+    std::cout << "RUN_VOLK_TESTS: " << name << std::endl;
+
+    //first let's get a list of available architectures for the test
+    std::vector<std::string> arch_list = get_arch_list(desc);
+
+    if(arch_list.size() < 2) {
+        std::cout << "no architectures to test" << std::endl;
+        return false;
+    }
+
+    //something that can hang onto memory and cleanup when this function exits
+    volk_qa_aligned_mem_pool mem_pool;
+
+    //now we have to get a function signature by parsing the name
+    std::vector<volk_type_t> inputsig, outputsig;
+    get_signatures_from_name(inputsig, outputsig, name);
+
+    //pull the input scalars into their own vector
+    std::vector<volk_type_t> inputsc;
+    for(size_t i=0; i<inputsig.size(); ) {
+        if(inputsig[i].is_scalar) {
+            inputsc.push_back(inputsig[i]);
+            inputsig.erase(inputsig.begin() + i); //do not advance: the next element slid into slot i
+        } else {
+            i++;
+        }
+    }
+    //for(int i=0; i<inputsig.size(); i++) std::cout << "Input: " << inputsig[i].str << std::endl;
+    //for(int i=0; i<outputsig.size(); i++) std::cout << "Output: " << outputsig[i].str << std::endl;
+    std::vector<void *> inbuffs;
+    BOOST_FOREACH(volk_type_t sig, inputsig) {
+        if(!sig.is_scalar) //we don't make buffers for scalars
+            inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1)));
+    }
+    for(size_t i=0; i<inbuffs.size(); i++) {
+        load_random_data(inbuffs[i], inputsig[i], vlen);
+    }
+
+    //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch
+    //each arch gets its own output buffers; the input buffers are shared between all archs
+    std::vector<std::vector<void *> > test_data;
+    for(size_t i=0; i<arch_list.size(); i++) {
+        std::vector<void *> arch_buffs;
+        for(size_t j=0; j<outputsig.size(); j++) {
+            arch_buffs.push_back(mem_pool.get_new(vlen*outputsig[j].size*(outputsig[j].is_complex ? 2 : 1)));
+        }
+        for(size_t j=0; j<inputsig.size(); j++) {
+            arch_buffs.push_back(inbuffs[j]);
+        }
+        test_data.push_back(arch_buffs);
+    }
+
+    //signature of test_data[i]: outputs first, then inputs
+    std::vector<volk_type_t> both_sigs;
+    both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end());
+    both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end());
+
+    //now run the test
+    clock_t start, end;
+    std::vector<double> profile_times;
+    for(size_t i = 0; i < arch_list.size(); i++) {
+        start = clock();
+
+        //dispatch on the number of buffer arguments and on the kind of scalar, if any
+        switch(both_sigs.size()) {
+            case 1:
+                if(inputsc.size() == 0) {
+                    run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+                    if(inputsc[0].is_complex) {
+                        run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+                    } else {
+                        run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+                    }
+                } else throw "unsupported 1 arg function >1 scalars";
+                break;
+            case 2:
+                if(inputsc.size() == 0) {
+                    run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+                    if(inputsc[0].is_complex) {
+                        run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+                    } else {
+                        run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+                    }
+                } else throw "unsupported 2 arg function >1 scalars";
+                break;
+            case 3:
+                if(inputsc.size() == 0) {
+                    run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+                    if(inputsc[0].is_complex) {
+                        run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+                    } else {
+                        run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+                    }
+                } else throw "unsupported 3 arg function >1 scalars";
+                break;
+            case 4:
+                run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+                break;
+            default:
+                throw "no function handler for this signature";
+                break;
+        }
+
+        end = clock();
+        double arch_time = (double)(end-start)/(double)CLOCKS_PER_SEC;
+        std::cout << arch_list[i] << " completed in " << arch_time << "s" << std::endl;
+
+        profile_times.push_back(arch_time);
+    }
+
+    //and now compare each output to the generic output
+    //first we have to know which output is the generic one, they aren't in order...
+    size_t generic_offset=0;
+    for(size_t i=0; i<arch_list.size(); i++)
+        if(arch_list[i] == "generic") generic_offset=i;
+
+    //now compare
+    //if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know
+
+    bool fail_global = false;
+    std::vector<bool> arch_results;
+    for(size_t i=0; i<arch_list.size(); i++) {
+        //sticky per-arch flag: once any buffer mismatches, the arch stays failed.
+        //Reusing one flag for every buffer let a later matching buffer (typically
+        //a shared input) hide an earlier mismatch from arch_results.
+        bool arch_fail = false;
+        if(i != generic_offset) {
+            for(size_t j=0; j<both_sigs.size(); j++) {
+                bool fail = false;
+                void *ref = test_data[generic_offset][j];
+                void *out = test_data[i][j];
+                const unsigned int n_elems = vlen*(both_sigs[j].is_complex ? 2 : 1);
+                if(both_sigs[j].is_float) {
+                    if(both_sigs[j].size == 8) {
+                        fail = fcompare((double *) ref, (double *) out, n_elems, tol);
+                    } else {
+                        fail = fcompare((float *) ref, (float *) out, n_elems, tol);
+                    }
+                } else {
+                    //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ
+                    switch(both_sigs[j].size) {
+                        case 8:
+                            if(both_sigs[j].is_signed) {
+                                fail = icompare((int64_t *) ref, (int64_t *) out, n_elems, tol);
+                            } else {
+                                fail = icompare((uint64_t *) ref, (uint64_t *) out, n_elems, tol);
+                            }
+                            break;
+                        case 4:
+                            if(both_sigs[j].is_signed) {
+                                fail = icompare((int32_t *) ref, (int32_t *) out, n_elems, tol);
+                            } else {
+                                fail = icompare((uint32_t *) ref, (uint32_t *) out, n_elems, tol);
+                            }
+                            break;
+                        case 2:
+                            if(both_sigs[j].is_signed) {
+                                fail = icompare((int16_t *) ref, (int16_t *) out, n_elems, tol);
+                            } else {
+                                fail = icompare((uint16_t *) ref, (uint16_t *) out, n_elems, tol);
+                            }
+                            break;
+                        case 1:
+                            if(both_sigs[j].is_signed) {
+                                fail = icompare((int8_t *) ref, (int8_t *) out, n_elems, tol);
+                            } else {
+                                fail = icompare((uint8_t *) ref, (uint8_t *) out, n_elems, tol);
+                            }
+                            break;
+                        default:
+                            fail = true; //unknown element size: treat as a failure
+                    }
+                }
+                if(fail) {
+                    arch_fail = true;
+                    fail_global = true;
+                    std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
+                }
+                //fail = memcmp(outbuffs[generic_offset], outbuffs[i], outputsig[0].size * vlen * (outputsig[0].is_complex ? 2:1));
+            }
+        }
+        arch_results.push_back(!arch_fail);
+    }
+
+    //pick the fastest passing implementation; the unaligned winner must also have impl_alignment == 0
+    double best_time_a = std::numeric_limits<double>::max();
+    double best_time_u = std::numeric_limits<double>::max();
+    std::string best_arch_a = "generic";
+    std::string best_arch_u = "generic";
+    for(size_t i=0; i < arch_list.size(); i++)
+    {
+        if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0)
+        {
+            best_time_u = profile_times[i];
+            best_arch_u = arch_list[i];
+        }
+        if((profile_times[i] < best_time_a) && arch_results[i])
+        {
+            best_time_a = profile_times[i];
+            best_arch_a = arch_list[i];
+        }
+    }
+
+    std::cout << "Best aligned arch: " << best_arch_a << std::endl;
+    std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
+    if(best_arch_vector) {
+        if(puppet_master_name == "NULL") {
+            best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u);
+        }
+        else {
+            best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u);
+        }
+    }
+
+    return fail_global;
+}
+
+
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
new file mode 100644
index 000000000..0f17cdaa3
--- /dev/null
+++ b/volk/lib/qa_utils.h
@@ -0,0 +1,41 @@
+#ifndef VOLK_QA_UTILS_H
+#define VOLK_QA_UTILS_H
+
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include <volk/volk.h>
+#include <volk/volk_common.h>
+
+// Describes one argument of a VOLK kernel signature as used by the QA code.
+struct volk_type_t {
+ // true for floating-point data (compared with a relative tolerance)
+ bool is_float;
+ // true for a scalar argument; no buffer is allocated for scalars
+ bool is_scalar;
+ // for integer data: selects the signed or unsigned type of width `size`
+ bool is_signed;
+ // true when each item is a complex pair, i.e. two components per item
+ bool is_complex;
+ // size in bytes of one component (1, 2, 4 or 8 are handled by the comparisons)
+ int size;
+ // textual form of the type -- presumably the token it was parsed from; confirm in qa_utils.cc
+ std::string str;
+};
+
+// Build a volk_type_t from its textual description.
+// NOTE(review): the accepted string format is defined in qa_utils.cc -- not visible here.
+volk_type_t volk_type_from_string(std::string);
+
+// Random-number helpers implemented in qa_utils.cc.
+// NOTE(review): presumably uniform() returns one random float and random_floats()
+// fills buf with n of them -- confirm range and distribution against the implementation.
+float uniform(void);
+void random_floats(float *buf, unsigned n);
+
+// Run and compare all implementations of one kernel. Arguments, in order:
+// kernel descriptor, manual entry point, kernel name, tolerance, scalar,
+// vector length, iteration count, optional best-arch result list, puppet master
+// name ("NULL" for none). Returns true on FAILURE, false on success.
+bool run_volk_tests(volk_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string);
+
+
+// VOLK_RUN_TESTS: define a Boost test case named <func>_test that runs
+// run_volk_tests() for the kernel and checks that it returns 0 (no failure).
+#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); }
+// VOLK_PROFILE: expression form of the same call; `results` is the
+// std::vector<std::string>* that receives the best-arch line for the kernel.
+#define VOLK_PROFILE(func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL")
+// VOLK_PUPPET_PROFILE: like VOLK_PROFILE, but the best-arch line is recorded
+// under the name of puppet_master_func instead of func.
+#define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func))
+// Function-pointer types the manual entry point is cast to before it is called.
+// Common layout: N buffer pointers, an optional scalar, the vector length, and
+// the name of the implementation to run.
+typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
+typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*);
+typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*);
+typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
+typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar complex float (lv_32fc_t) input
+typedef void (*volk_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+
+#endif //VOLK_QA_UTILS_H
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
new file mode 100644
index 000000000..f133897cb
--- /dev/null
+++ b/volk/lib/testqa.cc
@@ -0,0 +1,90 @@
+#include "qa_utils.h"
+#include <volk/volk.h>
+#include <boost/test/unit_test.hpp>
+
+// One Boost test case per kernel: VOLK_RUN_TESTS(kernel, tol, scalar, vlen, iter).
+// tol is a relative tolerance for float outputs and an absolute one for integer
+// outputs; scalar is only used by kernels that take a scalar argument.
+//VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000);
+//VOLK_RUN_TESTS(volk_16i_branch_4_state_8, 1e-4, 2046, 10000);
+VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f, 1e-5, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2, 1e-4, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_magnitude_16i, 1, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f, 1e-5, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16i_s32f_convert_32f, 1e-4, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16i_convert_8i, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_max_star_16i, 0, 0, 20460, 10000);
+//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i, 0, 0, 20460, 10000);
+//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 1000);
+//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 1000);
+VOLK_RUN_TESTS(volk_16u_byteswap, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_accumulator_s32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20460, 1);
+//VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_imag_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc, 1e-4, 0, 2046000, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32fc_index_max_16u, 3, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i, 1, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_magnitude_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_16i, 1, 32768, 20460, 1);
+// 2^31 written as a floating literal: the former integer expression 2<<31
+// overflows a 32-bit int (undefined behaviour, in practice 0), which scaled
+// every sample to zero and made this test vacuous.
+VOLK_RUN_TESTS(volk_32f_s32f_convert_32i, 1, 2147483648.0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_convert_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_8i, 1, 128, 20460, 1);
+//VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000);
+VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f, 1e-4, 0, 2046, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, 1e-4, 10, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_divide_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i, 1e-4, 0, 204600, 1);
+//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000);
+VOLK_RUN_TESTS(volk_32f_index_max_16u, 3, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic, 1, 32767, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_max_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_min_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_normalize, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_power_32f, 1e-4, 4, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_sqrt_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_subtract_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_x2_and_32i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_s32f_convert_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_x2_or_32i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32u_byteswap, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_32u_popcnt, 0, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_64f_convert_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64f_x2_max_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64f_x2_min_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64u_byteswap, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_64u_popcnt, 0, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i, 0, 256, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8i_convert_16i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8i_s32f_convert_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_conjugate_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_multiply_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1);
diff --git a/volk/lib/volk_prefs.c b/volk/lib/volk_prefs.c
new file mode 100644
index 000000000..f787b5e2a
--- /dev/null
+++ b/volk/lib/volk_prefs.c
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <volk/volk_prefs.h>
+
+//#if defined(_WIN32)
+//#include <Windows.h>
+//#endif
+
+/*
+ * Write the location of the per-user VOLK config file into `path`:
+ * "<home>/.volk/volk_config", where <home> is $HOME or, failing that, $APPDATA.
+ *
+ * path - caller-provided output buffer. volk_load_preferences() in this file
+ *        passes 512 bytes, so results that would not fit in 512 bytes are
+ *        rejected.
+ *
+ * On failure (neither variable set, or the result too long) `path` is set to
+ * the empty string; callers must test path[0], since assigning NULL to the
+ * parameter would not be visible to them.
+ */
+void volk_get_config_path(char *path)
+{
+    enum { config_path_max = 512 };
+    const char *suffix = "/.volk/volk_config";
+    const char *home;
+
+    if (path == NULL) return;
+
+    home = getenv("HOME");
+    if (home == NULL) home = getenv("APPDATA");
+    if (home == NULL || strlen(home) + strlen(suffix) + 1 > (size_t)config_path_max){
+        path[0] = '\0';
+        return;
+    }
+    strcpy(path, home);
+    strcat(path, suffix);
+}
+
+/*
+ * Read the per-user VOLK config file into a heap-allocated array.
+ *
+ * Each accepted line has the form "<kernel name> <aligned impl> <unaligned impl>"
+ * and a kernel name starting with "volk_"; other lines are ignored.
+ *
+ * prefs_res - receives the array (allocated with realloc, owned by the caller)
+ *             once the config file has been opened; it is left untouched when
+ *             no config file is found.
+ *
+ * Returns the number of valid entries stored in the array.
+ */
+size_t volk_load_preferences(volk_arch_pref_t **prefs_res)
+{
+    FILE *config_file;
+    char path[512], line[512];
+    size_t n_arch_prefs = 0;
+    volk_arch_pref_t *prefs = NULL;
+
+    //get the config path; an empty string means none could be determined
+    //(comparing the array itself against NULL can never be true)
+    path[0] = '\0';
+    volk_get_config_path(path);
+    if (path[0] == '\0') return n_arch_prefs; //no prefs found
+    config_file = fopen(path, "r");
+    if(!config_file) return n_arch_prefs; //no prefs found
+
+    //parse the file line by line, growing the array one slot ahead of the entries kept
+    while(fgets(line, sizeof(line), config_file) != NULL)
+    {
+        volk_arch_pref_t *p;
+        volk_arch_pref_t *grown = (volk_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs));
+        if(grown == NULL) break; //out of memory: keep the entries parsed so far
+        prefs = grown;
+        p = prefs + n_arch_prefs;
+        //NOTE(review): %s is unbounded; a token longer than the name/impl_a/impl_u
+        //fields declared in volk_prefs.h would overflow them -- add field widths there.
+        if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5))
+        {
+            n_arch_prefs++;
+        }
+    }
+    fclose(config_file);
+    *prefs_res = prefs;
+    return n_arch_prefs;
+}
diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c
new file mode 100644
index 000000000..6ab013f26
--- /dev/null
+++ b/volk/lib/volk_rank_archs.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <volk_rank_archs.h>
+#include <volk/volk_prefs.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* __popcnt(x): number of set bits in x. Uses the compiler builtin on GCC >= 3.4
+ * and a portable bit loop elsewhere. */
+#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4
+    #define __popcnt __builtin_popcount
+#else
+    /* static: a plain C99 `inline` definition provides no external definition,
+     * so a call that is not inlined could fail to link. */
+    static inline unsigned __popcnt(unsigned num)
+    {
+        unsigned pop = 0;
+        while(num)
+        {
+            pop += num & 0x1;
+            num >>= 1;
+        }
+        return pop;
+    }
+#endif
+
+/*
+ * Find an implementation by name.
+ *
+ * Returns the index of the first entry of impl_names whose first 20 characters
+ * match impl_name. If there is none, a warning is printed and the index of
+ * "generic" is returned instead. Returns -1 when "generic" itself is missing,
+ * so callers must not use a negative result as an index.
+ */
+int volk_get_index(
+    const char *impl_names[], //list of implementations by name
+    const size_t n_impls, //number of implementations available
+    const char *impl_name //the implementation name to find
+){
+    unsigned int i;
+    for (i = 0; i < n_impls; i++) {
+        if(!strncmp(impl_names[i], impl_name, 20)) {
+            return i;
+        }
+    }
+    //the fallback itself was not found: stop here instead of recursing forever
+    if(!strncmp(impl_name, "generic", 20)) {
+        printf("Volk warning: no generic impl found\n");
+        return -1;
+    }
+    printf("Volk warning: no arch found, returning generic impl\n");
+    return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
+}
+
+/*
+ * Choose the implementation index to use for one kernel.
+ *
+ * An entry for kern_name in the user preferences (loaded on the first call and
+ * cached in function-local statics) wins. Otherwise the implementation whose
+ * requirement mask has the most bits set is chosen, tracked separately for
+ * aligned and unaligned implementations.
+ */
+int volk_rank_archs(
+    const char *kern_name, //name of the kernel to rank
+    const char *impl_names[], //list of implementations by name
+    const int* impl_deps, //requirement mask per implementation
+    const bool* alignment, //alignment status of each implementation
+    size_t n_impls, //number of implementations available
+    const bool align //if false, filter aligned implementations
+){
+    static volk_arch_pref_t *volk_arch_prefs;
+    static size_t n_arch_prefs = 0;
+    static int prefs_loaded = 0;
+    size_t idx;
+    size_t aligned_pick = 0;
+    size_t unaligned_pick = 0;
+    int aligned_score = -1;
+    int unaligned_score = -1;
+
+    //read the preferences file only once
+    if(prefs_loaded == 0) {
+        n_arch_prefs = volk_load_preferences(&volk_arch_prefs);
+        prefs_loaded = 1;
+    }
+
+    //a preference entry for this kernel overrides the ranking below
+    for(idx = 0; idx < n_arch_prefs; idx++)
+    {
+        if(strncmp(kern_name, volk_arch_prefs[idx].name, sizeof(volk_arch_prefs[idx].name)) != 0) continue;
+        if(align) {
+            return volk_get_index(impl_names, n_impls, volk_arch_prefs[idx].impl_a);
+        }
+        return volk_get_index(impl_names, n_impls, volk_arch_prefs[idx].impl_u);
+    }
+
+    //score each implementation by the number of requirement bits it sets
+    for(idx = 0; idx < n_impls; idx++)
+    {
+        const signed score = __popcnt(impl_deps[idx]);
+        if(alignment[idx]) {
+            if(score > aligned_score) {
+                aligned_pick = idx;
+                aligned_score = score;
+            }
+        } else {
+            if(score > unaligned_score) {
+                unaligned_pick = idx;
+                unaligned_score = score;
+            }
+        }
+    }
+
+    //the aligned winner is only used when alignment is allowed and one exists
+    if(align && aligned_score != -1) {
+        return aligned_pick;
+    }
+    return unaligned_pick;
+}
diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h
new file mode 100644
index 000000000..b3bf8ff17
--- /dev/null
+++ b/volk/lib/volk_rank_archs.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_RANK_ARCHS_H
+#define INCLUDED_VOLK_RANK_ARCHS_H
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Return the index in impl_names of the implementation called impl_name.
+ * When it is not found, a warning is printed and the lookup falls back to
+ * the implementation called "generic".
+ */
+int volk_get_index(
+ const char *impl_names[], //list of implementations by name
+ const size_t n_impls, //number of implementations available
+ const char *impl_name //the implementation name to find
+);
+
+/*
+ * Return the index of the implementation to use for kern_name.
+ * A matching entry in the user's preferences wins; otherwise the
+ * implementation whose requirement mask has the most bits set is chosen.
+ * Aligned implementations are only considered when align is true.
+ */
+int volk_rank_archs(
+ const char *kern_name, //name of the kernel to rank
+ const char *impl_names[], //list of implementations by name
+ const int* impl_deps, //requirement mask per implementation
+ const bool* alignment, //alignment status of each implementation
+ size_t n_impls, //number of implementations available
+ const bool align //if false, filter aligned implementations
+);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/