diff options
Diffstat (limited to 'volk/lib')
30 files changed, 2470 insertions, 0 deletions
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt new file mode 100644 index 000000000..68fadc35b --- /dev/null +++ b/volk/lib/CMakeLists.txt @@ -0,0 +1,352 @@ +# +# Copyright 2011-2012 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +######################################################################## +# header file detection +######################################################################## +include(CheckIncludeFile) +CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H) +if(HAVE_CPUID_H) + add_definitions(-DHAVE_CPUID_H) +endif() + +CHECK_INCLUDE_FILE(intrin.h HAVE_INTRIN_H) +if(HAVE_INTRIN_H) + add_definitions(-DHAVE_INTRIN_H) +endif() + +CHECK_INCLUDE_FILE(fenv.h HAVE_FENV_H) +if(HAVE_FENV_H) + add_definitions(-DHAVE_FENV_H) +endif() + +CHECK_INCLUDE_FILE(dlfcn.h HAVE_DLFCN_H) +if(HAVE_DLFCN_H) + add_definitions(-DHAVE_DLFCN_H) + list(APPEND volk_libraries ${CMAKE_DL_LIBS}) +endif() + +######################################################################## +# Setup the compiler name +######################################################################## +set(COMPILER_NAME ${CMAKE_C_COMPILER_ID}) +if(MSVC) #its not set otherwise + set(COMPILER_NAME MSVC) +endif() + +message(STATUS "Compiler name: ${COMPILER_NAME}") + +if(NOT DEFINED COMPILER_NAME) + message(FATAL_ERROR "COMPILER_NAME undefined. Volk build may not support this compiler.") +endif() + +######################################################################## +# Special clang flag so flag checks can fail +######################################################################## +if(COMPILER_NAME MATCHES "GNU") + include(CheckCXXCompilerFlag) + CHECK_CXX_COMPILER_FLAG("-Werror=unused-command-line-argument" HAVE_WERROR_UNUSED_CMD_LINE_ARG) + if(HAVE_WERROR_UNUSED_CMD_LINE_ARG) + set(VOLK_FLAG_CHECK_FLAGS "-Werror=unused-command-line-argument") + endif() +endif() + +######################################################################## +# detect x86 flavor of CPU +######################################################################## +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(i.86|x86|x86_64|amd64)$") + message(STATUS "x86* CPU detected") + set(CPU_IS_x86 TRUE) +endif() + +######################################################################## +# determine passing architectures based on compile flag tests +######################################################################## +execute_process( + COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B} + ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py + --mode "arch_flags" --compiler "${COMPILER_NAME}" + OUTPUT_VARIABLE arch_flag_lines OUTPUT_STRIP_TRAILING_WHITESPACE +) + +macro(check_arch arch_name) + set(flags ${ARGN}) + set(have_${arch_name} TRUE) + foreach(flag ${flags}) + include(CheckCXXCompilerFlag) + set(have_flag have${flag}) + execute_process( #make the have_flag have nice alphanum chars (just for looks/not necessary) + COMMAND ${PYTHON_EXECUTABLE} -c "import re; print(re.sub('\\W', '_', '${have_flag}'))" + OUTPUT_VARIABLE have_flag OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(VOLK_FLAG_CHECK_FLAGS) + set(CMAKE_REQUIRED_FLAGS ${VOLK_FLAG_CHECK_FLAGS}) + endif() + CHECK_CXX_COMPILER_FLAG(${flag} ${have_flag}) + unset(CMAKE_REQUIRED_FLAGS) + if (NOT ${have_flag}) + set(have_${arch_name} FALSE) + endif() + endforeach(flag) + if (have_${arch_name}) + list(APPEND available_archs ${arch_name}) + endif() +endmacro(check_arch) + +foreach(line ${arch_flag_lines}) + string(REGEX REPLACE "," ";" arch_flags ${line}) + check_arch(${arch_flags}) +endforeach(line) + +macro(OVERRULE_ARCH arch reason) + message(STATUS "${reason}, Overruled arch ${arch}") + list(REMOVE_ITEM available_archs ${arch}) +endmacro(OVERRULE_ARCH) + +######################################################################## +# eliminate AVX on GCC < 4.4 +# even though it accepts -mavx, as won't assemble xgetbv, which we need +######################################################################## +if(CPU_IS_x86 AND COMPILER_NAME MATCHES "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion + OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) + if(GCC_VERSION VERSION_LESS "4.4") + OVERRULE_ARCH(avx "GCC missing xgetbv") + endif() +endif() + +######################################################################## +# implement overruling in the ORC case, +# since ORC always passes flag detection +######################################################################## +if(NOT ORC_FOUND) + OVERRULE_ARCH(orc "ORC support not found") +endif() + +######################################################################## +# implement overruling in the non-multilib case +# this makes things work when both -m32 and -m64 pass +######################################################################## +if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86) + include(CheckTypeSize) + check_type_size("void*[8]" SIZEOF_CPU BUILTIN_TYPES_ONLY) + if (${SIZEOF_CPU} EQUAL 64) + OVERRULE_ARCH(32 "CPU width is 64 bits") + endif() + if (${SIZEOF_CPU} EQUAL 32) + OVERRULE_ARCH(64 "CPU width is 32 bits") + endif() + + #MSVC 64 bit does not have MMX, overrule it + if (${SIZEOF_CPU} EQUAL 64 AND MSVC) + OVERRULE_ARCH(mmx "No MMX for Win64") + endif() + +endif() + +######################################################################## +# done overrules! print the result +######################################################################## +message(STATUS "Available architectures: ${available_archs}") + +######################################################################## +# determine available machines given the available architectures +######################################################################## +execute_process( + COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B} + ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py + --mode "machines" --archs "${available_archs}" + OUTPUT_VARIABLE available_machines OUTPUT_STRIP_TRAILING_WHITESPACE +) + +######################################################################## +# Implement machine overruling for redundant machines: +# A machine is redundant when expansion rules occur, +# and the arch superset passes configuration checks. +# When this occurs, eliminate the redundant machines +# to avoid unnecessary compilation of subset machines. +######################################################################## +foreach(arch mmx orc 64 32) + foreach(machine_name ${available_machines}) + string(REPLACE "_${arch}" "" machine_name_no_arch ${machine_name}) + if (${machine_name} STREQUAL ${machine_name_no_arch}) + else() + list(REMOVE_ITEM available_machines ${machine_name_no_arch}) + endif() + endforeach(machine_name) +endforeach(arch) + +######################################################################## +# done overrules! print the result +######################################################################## +message(STATUS "Available machines: ${available_machines}") + +######################################################################## +# Create rules to run the volk generator +######################################################################## + +#dependencies are all python, xml, and header implementation files +file(GLOB xml_files ${CMAKE_SOURCE_DIR}/gen/*.xml) +file(GLOB py_files ${CMAKE_SOURCE_DIR}/gen/*.py) +file(GLOB h_files ${CMAKE_SOURCE_DIR}/kernels/volk/*.h) + +macro(gen_template tmpl output) + list(APPEND volk_gen_sources ${output}) + add_custom_command( + OUTPUT ${output} + DEPENDS ${xml_files} ${py_files} ${h_files} ${tmpl} + COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B} + ${CMAKE_SOURCE_DIR}/gen/volk_tmpl_utils.py + --input ${tmpl} --output ${output} ${ARGN} + ) +endmacro(gen_template) + +make_directory(${CMAKE_BINARY_DIR}/include/volk) + +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk.h) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk.c) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_typedefs.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_typedefs.h) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_cpu.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_cpu.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_cpu.c) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_config_fixed.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machines.tmpl.h ${CMAKE_BINARY_DIR}/lib/volk_machines.h) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machines.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_machines.c) + +foreach(machine_name ${available_machines}) + #generate machine source + set(machine_source ${CMAKE_CURRENT_BINARY_DIR}/volk_machine_${machine_name}.c) + gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machine_xxx.tmpl.c ${machine_source} ${machine_name}) + + #determine machine flags + execute_process( + COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B} + ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py + --mode "machine_flags" --machine "${machine_name}" --compiler "${COMPILER_NAME}" + OUTPUT_VARIABLE ${machine_name}_flags OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(${machine_name}_flags) + set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${${machine_name}_flags}") + endif() + + #add to available machine defs + string(TOUPPER LV_MACHINE_${machine_name} machine_def) + list(APPEND machine_defs ${machine_def}) +endforeach(machine_name) + +######################################################################## +# Set local include directories first +######################################################################## +include_directories( + ${CMAKE_BINARY_DIR}/include + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/kernels + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} +) + +######################################################################## +# Handle orc support +######################################################################## +if(ORC_FOUND) + #setup orc library usage + include_directories(${ORC_INCLUDE_DIRS}) + link_directories(${ORC_LIBRARY_DIRS}) + list(APPEND volk_libraries ${ORC_LIBRARIES}) + + #setup orc functions + file(GLOB orc_files ${CMAKE_SOURCE_DIR}/orc/*.orc) + foreach(orc_file ${orc_files}) + + #extract the name for the generated c source from the orc file + get_filename_component(orc_file_name_we ${orc_file} NAME_WE) + set(orcc_gen ${CMAKE_CURRENT_BINARY_DIR}/${orc_file_name_we}.c) + + #create a rule to generate the source and add to the list of sources + add_custom_command( + COMMAND ${ORCC_EXECUTABLE} --include math.h --implementation -o ${orcc_gen} ${orc_file} + DEPENDS ${orc_file} OUTPUT ${orcc_gen} + ) + list(APPEND volk_sources ${orcc_gen}) + + endforeach(orc_file) +else() + message(STATUS "Did not find liborc and orcc, disabling orc support...") +endif() + +######################################################################## +# Setup the volk sources list and library +######################################################################## +if(NOT WIN32) + add_definitions(-fvisibility=hidden) +endif() + +list(APPEND volk_sources + ${CMAKE_CURRENT_SOURCE_DIR}/volk_prefs.c + ${CMAKE_CURRENT_SOURCE_DIR}/volk_rank_archs.c + ${volk_gen_sources} +) + +#set the machine definitions where applicable +set_source_files_properties( + ${CMAKE_CURRENT_BINARY_DIR}/volk.c + ${CMAKE_CURRENT_BINARY_DIR}/volk_machines.c +PROPERTIES COMPILE_DEFINITIONS "${machine_defs}") + +if(MSVC) + #add compatibility includes for stdint types + include_directories(${CMAKE_SOURCE_DIR}/cmake/msvc) + add_definitions(-DHAVE_CONFIG_H) + #compile the sources as C++ due to the lack of complex.h under MSVC + set_source_files_properties(${volk_sources} PROPERTIES LANGUAGE CXX) +endif() + +#create the volk runtime library +add_library(volk SHARED ${volk_sources}) +target_link_libraries(volk ${volk_libraries}) +set_target_properties(volk PROPERTIES SOVERSION ${LIBVER}) +set_target_properties(volk PROPERTIES DEFINE_SYMBOL "volk_EXPORTS") + +install(TARGETS volk + LIBRARY DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_runtime" # .so file + ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_devel" # .lib file + RUNTIME DESTINATION bin COMPONENT "volk_runtime" # .dll file +) + +######################################################################## +# Build the QA test application +######################################################################## + + +if(Boost_FOUND) + + set_source_files_properties( + ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc PROPERTIES + COMPILE_DEFINITIONS "BOOST_TEST_DYN_LINK;BOOST_TEST_MAIN" + ) + + include_directories(${Boost_INCLUDE_DIRS}) + link_directories(${Boost_LIBRARY_DIRS}) + + add_executable(test_all + ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc + ${CMAKE_CURRENT_SOURCE_DIR}/qa_utils.cc + ) + target_link_libraries(test_all volk ${Boost_LIBRARIES}) + add_test(qa_volk_test_all test_all) + +endif(Boost_FOUND) + diff --git a/volk/lib/gcc_x86_cpuid.h b/volk/lib/gcc_x86_cpuid.h new file mode 100644 index 000000000..3c3f47b00 --- /dev/null +++ b/volk/lib/gcc_x86_cpuid.h @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 3, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Under Section 7 of GPL version 3, you are granted additional + * permissions described in the GCC Runtime Library Exception, version + * 3.1, as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License and + * a copy of the GCC Runtime Library Exception along with this program; + * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + * <http://www.gnu.org/licenses/>. + */ + +/* %ecx */ +#define bit_SSE3 (1 << 0) +#define bit_PCLMUL (1 << 1) +#define bit_SSSE3 (1 << 9) +#define bit_FMA (1 << 12) +#define bit_CMPXCHG16B (1 << 13) +#define bit_SSE4_1 (1 << 19) +#define bit_SSE4_2 (1 << 20) +#define bit_MOVBE (1 << 22) +#define bit_POPCNT (1 << 23) +#define bit_AES (1 << 25) +#define bit_XSAVE (1 << 26) +#define bit_OSXSAVE (1 << 27) +#define bit_AVX (1 << 28) +#define bit_F16C (1 << 29) +#define bit_RDRND (1 << 30) + +/* %edx */ +#define bit_CMPXCHG8B (1 << 8) +#define bit_CMOV (1 << 15) +#define bit_MMX (1 << 23) +#define bit_FXSAVE (1 << 24) +#define bit_SSE (1 << 25) +#define bit_SSE2 (1 << 26) + +/* Extended Features */ +/* %ecx */ +#define bit_LAHF_LM (1 << 0) +#define bit_ABM (1 << 5) +#define bit_SSE4a (1 << 6) +#define bit_XOP (1 << 11) +#define bit_LWP (1 << 15) +#define bit_FMA4 (1 << 16) +#define bit_TBM (1 << 21) + +/* %edx */ +#define bit_MMXEXT (1 << 22) +#define bit_LM (1 << 29) +#define bit_3DNOWP (1 << 30) +#define bit_3DNOW (1 << 31) + +/* Extended Features (%eax == 7) */ +#define bit_FSGSBASE (1 << 0) +#define bit_BMI (1 << 3) + +#if defined(__i386__) && defined(__PIC__) +/* %ebx may be the PIC register. */ +#if __GNUC__ >= 3 +#define __cpuid(level, a, b, c, d) \ + __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchg{l}\t{%%}ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level)) + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchg{l}\t{%%}ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) +#else +/* Host GCCs older than 3.0 weren't supporting Intel asm syntax + nor alternatives in i386 code. */ +#define __cpuid(level, a, b, c, d) \ + __asm__ ("xchgl\t%%ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchgl\t%%ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level)) + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ ("xchgl\t%%ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchgl\t%%ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) +#endif +#else +#define __cpuid(level, a, b, c, d) \ + __asm__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level)) + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) +#endif + +/* Return highest supported input value for cpuid instruction. ext can + be either 0x0 or 0x8000000 to return highest supported value for + basic or extended cpuid information. Function returns 0 if cpuid + is not supported or whatever cpuid returns in eax register. If sig + pointer is non-null, then first four bytes of the signature + (as found in ebx register) are returned in location pointed by sig. */ + +static __inline unsigned int +__get_cpuid_max (unsigned int __ext, unsigned int *__sig) +{ + unsigned int __eax, __ebx, __ecx, __edx; + +#ifndef __x86_64__ + /* See if we can use cpuid. On AMD64 we always can. */ +#if __GNUC__ >= 3 + __asm__ ("pushf{l|d}\n\t" + "pushf{l|d}\n\t" + "pop{l}\t%0\n\t" + "mov{l}\t{%0, %1|%1, %0}\n\t" + "xor{l}\t{%2, %0|%0, %2}\n\t" + "push{l}\t%0\n\t" + "popf{l|d}\n\t" + "pushf{l|d}\n\t" + "pop{l}\t%0\n\t" + "popf{l|d}\n\t" + : "=&r" (__eax), "=&r" (__ebx) + : "i" (0x00200000)); +#else +/* Host GCCs older than 3.0 weren't supporting Intel asm syntax + nor alternatives in i386 code. */ + __asm__ ("pushfl\n\t" + "pushfl\n\t" + "popl\t%0\n\t" + "movl\t%0, %1\n\t" + "xorl\t%2, %0\n\t" + "pushl\t%0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl\t%0\n\t" + "popfl\n\t" + : "=&r" (__eax), "=&r" (__ebx) + : "i" (0x00200000)); +#endif + + if (!((__eax ^ __ebx) & 0x00200000)) + return 0; +#endif + + /* Host supports cpuid. Return highest supported cpuid input value. */ + __cpuid (__ext, __eax, __ebx, __ecx, __edx); + + if (__sig) + *__sig = __ebx; + + return __eax; +} + +/* Return cpuid data for requested cpuid level, as found in returned + eax, ebx, ecx and edx registers. The function checks if cpuid is + supported and returns 1 for valid cpuid information or 0 for + unsupported cpuid level. All pointers are required to be non-null. */ + +static __inline int +__get_cpuid (unsigned int __level, + unsigned int *__eax, unsigned int *__ebx, + unsigned int *__ecx, unsigned int *__edx) +{ + unsigned int __ext = __level & 0x80000000; + + if (__get_cpuid_max (__ext, 0) < __level) + return 0; + + __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx); + return 1; +} diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc new file mode 100644 index 000000000..8da43b972 --- /dev/null +++ b/volk/lib/qa_16s_add_quad_aligned16.cc @@ -0,0 +1,89 @@ +#include <volk/volk.h> +#include <qa_16s_add_quad_aligned16.h> +#include <volk/volk_16s_add_quad_aligned16.h> +#include <cstdlib> +#include <ctime> +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_add_quad_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + + + +void qa_16s_add_quad_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3200; + const int ITERS = 100000; + __VOLK_ATTR_ALIGNED(16) short input0[vlen]; + __VOLK_ATTR_ALIGNED(16) short input1[vlen]; + __VOLK_ATTR_ALIGNED(16) short input2[vlen]; + __VOLK_ATTR_ALIGNED(16) short input3[vlen]; + __VOLK_ATTR_ALIGNED(16) short input4[vlen]; + + __VOLK_ATTR_ALIGNED(16) short output0[vlen]; + __VOLK_ATTR_ALIGNED(16) short output1[vlen]; + __VOLK_ATTR_ALIGNED(16) short output2[vlen]; + __VOLK_ATTR_ALIGNED(16) short output3[vlen]; + __VOLK_ATTR_ALIGNED(16) short output01[vlen]; + __VOLK_ATTR_ALIGNED(16) short output11[vlen]; + __VOLK_ATTR_ALIGNED(16) short output21[vlen]; + __VOLK_ATTR_ALIGNED(16) short output31[vlen]; + + for(int i = 0; i < vlen; ++i) { + short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus1 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus1 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus2 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus2 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; + + input0[i] = plus0 - minus0; + input1[i] = plus1 - minus1; + input2[i] = plus2 - minus2; + input3[i] = plus3 - minus3; + input4[i] = plus4 - minus4; + + } + printf("16s_add_quad_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_add_quad_aligned16_manual(output0, output1, output2, output3, input0, input1, input2, input3, input4, vlen << 1 , "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_add_quad_aligned16_manual(output01, output11, output21, output31, input0, input1, input2, input3, input4, vlen << 1 , "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output1[i], output11[i]); + CPPUNIT_ASSERT_EQUAL(output2[i], output21[i]); + CPPUNIT_ASSERT_EQUAL(output3[i], output31[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_add_quad_aligned16.h b/volk/lib/qa_16s_add_quad_aligned16.h new file mode 100644 index 000000000..3c1ae978b --- /dev/null +++ b/volk/lib/qa_16s_add_quad_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H +#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_add_quad_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc new file mode 100644 index 000000000..5a58569a1 --- /dev/null +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc @@ -0,0 +1,106 @@ +#include <volk/volk.h> +#include <qa_16s_branch_4_state_8_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for ssse3 + +#ifndef LV_HAVE_SSSE3 + +void qa_16s_branch_4_state_8_aligned16::t1() { + printf("ssse3 not available... no test performed\n"); +} + +#else + +void qa_16s_branch_4_state_8_aligned16::t1() { + const int num_iters = 1000000; + const int vlen = 32; + + static char permute0[16]__attribute__((aligned(16))) = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03}; + static char permute1[16]__attribute__((aligned(16))) = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01}; + static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f}; + static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d}; + static char* permuters[4] = {permute0, permute1, permute2, permute3}; + + unsigned int num_bytes = vlen << 1; + + volk_environment_init(); + clock_t start, end; + double total; + + __VOLK_ATTR_ALIGNED(16) short target[vlen]; + __VOLK_ATTR_ALIGNED(16) short target2[vlen]; + __VOLK_ATTR_ALIGNED(16) short target3[vlen]; + + __VOLK_ATTR_ALIGNED(16) short src0[vlen]; + __VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen] = { +7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 }; + __VOLK_ATTR_ALIGNED(16) short cntl0[vlen] = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + __VOLK_ATTR_ALIGNED(16) short cntl1[vlen] = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + __VOLK_ATTR_ALIGNED(16) short cntl2[vlen] = { + 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 }; + __VOLK_ATTR_ALIGNED(16) short cntl3[vlen] = { + 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff }; + __VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4}; + + + + for(int i = 0; i < vlen; ++i) { + src0[i] = i; + + } + + + printf("16s_branch_4_state_8_aligned\n"); + + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("permute_and_scalar_add_time: %f\n", total); + + + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("branch_4_state_8_time, ssse3: %f\n", total); + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("permute_and_scalar_add_time, generic: %f\n", total); + + + + for(int i = 0; i < vlen; ++i) { + printf("psa... %d, b4s8... %d\n", target[i], target3[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT(target[i] == target2[i]); + CPPUNIT_ASSERT(target[i] == target3[i]); + } +} + + +#endif diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.h b/volk/lib/qa_16s_branch_4_state_8_aligned16.h new file mode 100644 index 000000000..41ab073e0 --- /dev/null +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H +#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc new file mode 100644 index 000000000..dadd2c580 --- /dev/null +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc @@ -0,0 +1,78 @@ +#include <volk/volk.h> +#include <qa_16s_permute_and_scalar_add_aligned16.h> +#include <volk/volk_16s_permute_and_scalar_add_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_permute_and_scalar_add_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_16s_permute_and_scalar_add_aligned16::t1() { + const int vlen = 64; + + unsigned int num_bytes = vlen << 1; + + volk_environment_init(); + clock_t start, end; + double total; + + __VOLK_ATTR_ALIGNED(16) short target[vlen]; + __VOLK_ATTR_ALIGNED(16) short target2[vlen]; + __VOLK_ATTR_ALIGNED(16) short src0[vlen]; + __VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen]; + __VOLK_ATTR_ALIGNED(16) short cntl0[vlen]; + __VOLK_ATTR_ALIGNED(16) short cntl1[vlen]; + __VOLK_ATTR_ALIGNED(16) short cntl2[vlen]; + __VOLK_ATTR_ALIGNED(16) short cntl3[vlen]; + __VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4}; + + for(int i = 0; i < vlen; ++i) { + src0[i] = i; + permute_indexes[i] = (3 * i)%vlen; + cntl0[i] = 0xff; + cntl1[i] = 0xff * (i%2); + cntl2[i] = 0xff * ((i>>1)%2); + cntl3[i] = 0xff * ((i%4) == 3); + } + + printf("16s_permute_and_scalar_add_aligned\n"); + + start = clock(); + for(int i = 0; i < 100000; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("generic_time: %f\n", total); + + start = clock(); + for(int i = 0; i < 100000; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("sse2_time: %f\n", total); + + + for(int i = 0; i < vlen; ++i) { + //printf("generic... %d, sse2... %d\n", target[i], target2[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT(target[i] == target2[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h new file mode 100644 index 000000000..3643aeef6 --- /dev/null +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H +#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc new file mode 100644 index 000000000..2a5dec44a --- /dev/null +++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_16s_quad_max_star_aligned16.h> +#include <volk/volk_16s_quad_max_star_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_quad_max_star_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_16s_quad_max_star_aligned16::t1() { + const int vlen = 34; + + __VOLK_ATTR_ALIGNED(16) short input0[vlen]; + __VOLK_ATTR_ALIGNED(16) short input1[vlen]; + __VOLK_ATTR_ALIGNED(16) short input2[vlen]; + __VOLK_ATTR_ALIGNED(16) short input3[vlen]; + + __VOLK_ATTR_ALIGNED(16) short output0[vlen]; + __VOLK_ATTR_ALIGNED(16) short output1[vlen]; + + for(int i = 0; i < vlen; ++i) { + short plus0 = (short) (rand() - (RAND_MAX/2)); + short plus1 = (short) (rand() - (RAND_MAX/2)); + short plus2 = (short) (rand() - (RAND_MAX/2)); + short plus3 = (short) (rand() - (RAND_MAX/2)); + + short minus0 = (short) (rand() - (RAND_MAX/2)); + short minus1 = (short) (rand() - (RAND_MAX/2)); + short minus2 = (short) (rand() - (RAND_MAX/2)); + short minus3 = (short) (rand() - (RAND_MAX/2)); + + input0[i] = plus0 - minus0; + input1[i] = plus1 - minus1; + input2[i] = plus2 - minus2; + input3[i] = plus3 - minus3; + } + + volk_16s_quad_max_star_aligned16_manual(output0, input0, input1, input2, input3, 2*vlen, "generic"); + + volk_16s_quad_max_star_aligned16_manual(output1, input0, input1, input2, input3, 2*vlen, "sse2"); + + printf("16s_quad_max_star_aligned\n"); + for(int i = 0; i < vlen; ++i) { + printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.h b/volk/lib/qa_16s_quad_max_star_aligned16.h new file mode 100644 index 000000000..51e77081a --- /dev/null +++ b/volk/lib/qa_16s_quad_max_star_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H +#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc new file mode 100644 index 000000000..4e792ec6c --- /dev/null +++ b/volk/lib/qa_32f_fm_detect_aligned16.cc @@ -0,0 +1,61 @@ +#include <volk/volk.h> +#include <qa_32f_fm_detect_aligned16.h> +#include <volk/volk_32f_fm_detect_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_fm_detect_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_fm_detect_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + __VOLK_ATTR_ALIGNED(16) float input0[vlen]; + + __VOLK_ATTR_ALIGNED(16) float output0[vlen]; + __VOLK_ATTR_ALIGNED(16) float output01[vlen]; + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_fm_detect_aligned\n"); + + start = clock(); + float save = 0.1; + for(int count = 0; count < ITERS; ++count) { + volk_32f_fm_detect_aligned16_manual(output0, input0, 1.0, &save, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + save = 0.1; + for(int count = 0; count < ITERS; ++count) { + volk_32f_fm_detect_aligned16_manual(output01, input0, 1.0, &save, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_fm_detect_aligned16.h b/volk/lib/qa_32f_fm_detect_aligned16.h new file mode 100644 index 000000000..a2680c524 --- /dev/null +++ b/volk/lib/qa_32f_fm_detect_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H +#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc new file mode 100644 index 000000000..2df206726 --- /dev/null +++ b/volk/lib/qa_32f_index_max_aligned16.cc @@ -0,0 +1,103 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32f_index_max_aligned16.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 1000000 +#define VEC_LEN 3097 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE + +void qa_32f_index_max_aligned16::t1(){ + printf("sse not available... no test performed\n"); +} + +#else + + +void qa_32f_index_max_aligned16::t1(){ + + const int vlen = VEC_LEN; + + + volk_runtime_init(); + + volk_environment_init(); + int ret; + + unsigned int* target_sse4_1; + unsigned int* target_sse; + unsigned int* target_generic; + float* src0 ; + + + unsigned int i_target_sse4_1; + target_sse4_1 = &i_target_sse4_1; + unsigned int i_target_sse; + target_sse = &i_target_sse; + unsigned int i_target_generic; + target_generic = &i_target_generic; + + ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float)); + + random_floats((float*)src0, vlen); + + printf("32f_index_max_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + get_volk_runtime()->volk_32f_index_max_aligned16(target_sse4_1, src0, vlen); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1 time: %f\n", total); + + + printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]); + CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]); + CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]); + + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32f_index_max_aligned16.h b/volk/lib/qa_32f_index_max_aligned16.h new file mode 100644 index 000000000..8cadffa47 --- /dev/null +++ b/volk/lib/qa_32f_index_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H +#define INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_index_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc new file mode 100644 index 000000000..3859bcb52 --- /dev/null +++ b/volk/lib/qa_32fc_index_max_aligned16.cc @@ -0,0 +1,89 @@ +#include <volk/volk.h> +#include <qa_32fc_index_max_aligned16.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 1000000 +#define VEC_LEN 3096 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_index_max_aligned16::t1(){ + printf("sse3 not available... no test performed\n"); +} + +#else + + +void qa_32fc_index_max_aligned16::t1(){ + + const int vlen = VEC_LEN; + + volk_environment_init(); + int ret; + + unsigned int* target; + unsigned int* target_generic; + std::complex<float>* src0 ; + + + unsigned int i_target; + target = &i_target; + unsigned int i_target_generic; + target_generic = &i_target_generic; + ret = posix_memalign((void**)&src0, 16, vlen << 3); + + random_floats((float*)src0, vlen * 2); + + printf("32fc_index_max_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 time: %f\n", total); + + + + + printf("generic: %u, sse3: %u\n", target_generic[0], target[0]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1); + + + + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_index_max_aligned16.h b/volk/lib/qa_32fc_index_max_aligned16.h new file mode 100644 index 000000000..0990bcb1f --- /dev/null +++ b/volk/lib/qa_32fc_index_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H +#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_index_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc new file mode 100644 index 000000000..daca31d9c --- /dev/null +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc @@ -0,0 +1,64 @@ +#include <volk/volk.h> +#include <qa_32fc_power_spectral_density_32f_aligned16.h> +#include <volk/volk_32fc_power_spectral_density_32f_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for sse3 + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_power_spectral_density_32f_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#else + +void qa_32fc_power_spectral_density_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + __VOLK_ATTR_ALIGNED(16) std::complex<float> input0[vlen]; + + __VOLK_ATTR_ALIGNED(16) float output_generic[vlen]; + __VOLK_ATTR_ALIGNED(16) float output_sse3[vlen]; + + const float scalar = vlen; + const float rbw = 1.7; + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); + } + printf("32fc_power_spectral_density_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4)); + } +} + +#endif diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h new file mode 100644 index 000000000..26f430bec --- /dev/null +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc new file mode 100644 index 000000000..b825c20e4 --- /dev/null +++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc @@ -0,0 +1,138 @@ +#include <volk/volk.h> +#include <qa_32fc_x2_conjugate_dot_prod_32fc_u.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse + +#if LV_HAVE_SSE && LV_HAVE_64 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex<float>* input; + std::complex<float>* taps; + + std::complex<float>* result_generic; + std::complex<float>* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex<float>(0,0); + result[0] = std::complex<float>(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse"); + + printf("32fc_x2_conjugate_dot_prod_32fc_u\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#elif LV_HAVE_SSE && LV_HAVE_32 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex<float>* input; + std::complex<float>* taps; + + std::complex<float>* result_generic; + std::complex<float>* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex<float>(0,0); + result[0] = std::complex<float>(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse_32"); + + printf("32fc_x2_conjugate_dot_prod_32fc_u\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#else + +void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { + printf("sse not available... no test performed\n"); +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h new file mode 100644 index 000000000..f07402403 --- /dev/null +++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H +#define INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_x2_conjugate_dot_prod_32fc_u : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_x2_conjugate_dot_prod_32fc_u); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H */ diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc new file mode 100644 index 000000000..5559d933d --- /dev/null +++ b/volk/lib/qa_32u_popcnt_aligned16.cc @@ -0,0 +1,62 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32u_popcnt_aligned16.h> +#include <volk/volk_32u_popcnt_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for sse + +#ifndef LV_HAVE_SSE4_2 + +void qa_32u_popcnt_aligned16::t1() { + printf("sse4.2 not available... no test performed\n"); +} + +#else + +void qa_32u_popcnt_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + + const int ITERS = 10000000; + __VOLK_ATTR_ALIGNED(16) uint32_t input0; + + __VOLK_ATTR_ALIGNED(16) uint32_t output0; + __VOLK_ATTR_ALIGNED(16) uint32_t output01; + + input0 = ((uint32_t) (rand() - (RAND_MAX/2))); + output0 = 0; + output01 = 0; + + printf("32u_popcnt_aligned\n"); + + start = clock(); + uint32_t ret = 0; + for(int count = 0; count < ITERS; ++count) { + volk_32u_popcnt_aligned16_manual(&ret, input0, "generic"); + output0 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + ret = 0; + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32u_popcnt_aligned16(&ret, input0); + output01 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.2_time: %f\n", total); + + + CPPUNIT_ASSERT_EQUAL(output0, output01); +} + +#endif diff --git a/volk/lib/qa_32u_popcnt_aligned16.h b/volk/lib/qa_32u_popcnt_aligned16.h new file mode 100644 index 000000000..fa1dc1041 --- /dev/null +++ b/volk/lib/qa_32u_popcnt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32U_POPCNT_ALIGNED16_H +#define INCLUDED_QA_32U_POPCNT_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32u_popcnt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32u_popcnt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32U_POPCNT_ALIGNED16_H */ diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc new file mode 100644 index 000000000..391601f22 --- /dev/null +++ b/volk/lib/qa_64u_popcnt_aligned16.cc @@ -0,0 +1,62 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_64u_popcnt_aligned16.h> +#include <volk/volk_64u_popcnt_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for sse + +#ifndef LV_HAVE_SSE4_2 + +void qa_64u_popcnt_aligned16::t1() { + printf("sse4.2 not available... no test performed\n"); +} + +#else + +void qa_64u_popcnt_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + + const int ITERS = 10000000; + __VOLK_ATTR_ALIGNED(16) uint64_t input0; + + __VOLK_ATTR_ALIGNED(16) uint64_t output0; + __VOLK_ATTR_ALIGNED(16) uint64_t output01; + + input0 = ((uint64_t) (rand() - (RAND_MAX/2))); + output0 = 0; + output01 = 0; + + printf("64u_popcnt_aligned\n"); + + start = clock(); + uint64_t ret = 0; + for(int count = 0; count < ITERS; ++count) { + volk_64u_popcnt_aligned16_manual(&ret, input0, "generic"); + output0 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + ret = 0; + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_64u_popcnt_aligned16(&ret, input0); + output01 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.2_time: %f\n", total); + + + CPPUNIT_ASSERT_EQUAL(output0, output01); +} + +#endif diff --git a/volk/lib/qa_64u_popcnt_aligned16.h b/volk/lib/qa_64u_popcnt_aligned16.h new file mode 100644 index 000000000..217822d6e --- /dev/null +++ b/volk/lib/qa_64u_popcnt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64U_POPCNT_ALIGNED16_H +#define INCLUDED_QA_64U_POPCNT_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_64u_popcnt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64u_popcnt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64U_POPCNT_ALIGNED16_H */ diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc new file mode 100644 index 000000000..e526eb2d0 --- /dev/null +++ b/volk/lib/qa_utils.cc @@ -0,0 +1,477 @@ +#include "qa_utils.h" +#include <cstring> +#include <boost/foreach.hpp> +#include <boost/assign/list_of.hpp> +#include <boost/tokenizer.hpp> +#include <iostream> +#include <vector> +#include <list> +#include <ctime> +#include <cmath> +#include <limits> +#include <boost/lexical_cast.hpp> +#include <volk/volk.h> +#include <volk/volk_cpu.h> +#include <volk/volk_common.h> +#include <boost/typeof/typeof.hpp> +#include <boost/type_traits.hpp> +#include <stdio.h> + +float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +template <class t> +void random_floats (t *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +void load_random_data(void *data, volk_type_t type, unsigned int n) { + if(type.is_complex) n *= 2; + if(type.is_float) { + if(type.size == 8) random_floats<double>((double *)data, n); + else random_floats<float>((float *)data, n); + } else { + float int_max = float(uint64_t(2) << (type.size*8)); + if(type.is_signed) int_max /= 2.0; + for(unsigned int i=0; i<n; i++) { + float scaled_rand = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * int_max; + //man i really don't know how to do this in a more clever way, you have to cast down at some point + switch(type.size) { + case 8: + if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand; + else ((uint64_t *)data)[i] = (uint64_t) scaled_rand; + break; + case 4: + if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand; + else ((uint32_t *)data)[i] = (uint32_t) scaled_rand; + break; + case 2: + if(type.is_signed) ((int16_t *)data)[i] = (int16_t) scaled_rand; + else ((uint16_t *)data)[i] = (uint16_t) scaled_rand; + break; + case 1: + if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand; + else ((uint8_t *)data)[i] = (uint8_t) scaled_rand; + break; + default: + throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here + } + } + } +} + +static std::vector<std::string> get_arch_list(volk_func_desc_t desc) { + std::vector<std::string> archlist; + + for(size_t i = 0; i < desc.n_impls; i++) { + //if(!(archs[i+1] & volk_get_lvarch())) continue; //this arch isn't available on this pc + archlist.push_back(std::string(desc.impl_names[i])); + } + + return archlist; +} + +volk_type_t volk_type_from_string(std::string name) { + volk_type_t type; + type.is_float = false; + type.is_scalar = false; + type.is_complex = false; + type.is_signed = false; + type.size = 0; + type.str = name; + + if(name.size() < 2) throw std::string("name too short to be a datatype"); + + //is it a scalar? + if(name[0] == 's') { + type.is_scalar = true; + name = name.substr(1, name.size()-1); + } + + //get the data size + size_t last_size_pos = name.find_last_of("0123456789"); + if(last_size_pos < 0) throw std::string("no size spec in type ").append(name); + //will throw if malformed + int size = boost::lexical_cast<int>(name.substr(0, last_size_pos+1)); + + assert(((size % 8) == 0) && (size <= 64) && (size != 0)); + type.size = size/8; //in bytes + + for(size_t i=last_size_pos+1; i < name.size(); i++) { + switch (name[i]) { + case 'f': + type.is_float = true; + break; + case 'i': + type.is_signed = true; + break; + case 'c': + type.is_complex = true; + break; + case 'u': + type.is_signed = false; + break; + default: + throw; + } + } + + return type; +} + +static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, + std::vector<volk_type_t> &outputsig, + std::string name) { + boost::char_separator<char> sep("_"); + boost::tokenizer<boost::char_separator<char> > tok(name, sep); + std::vector<std::string> toked; + tok.assign(name); + toked.assign(tok.begin(), tok.end()); + + assert(toked[0] == "volk"); + toked.erase(toked.begin()); + + //ok. we're assuming a string in the form + //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment) + + enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT; + std::string fn_name; + volk_type_t type; + BOOST_FOREACH(std::string token, toked) { + try { + type = volk_type_from_string(token); + if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... + + if(side == SIDE_INPUT) inputsig.push_back(type); + else outputsig.push_back(type); + } catch (...){ + if(token[0] == 'x') { //it's a multiplier + if(side == SIDE_INPUT) assert(inputsig.size() > 0); + else assert(outputsig.size() > 0); + int multiplier = boost::lexical_cast<int>(token.substr(1, token.size()-1)); //will throw if invalid + for(int i=1; i<multiplier; i++) { + if(side == SIDE_INPUT) inputsig.push_back(inputsig.back()); + else outputsig.push_back(outputsig.back()); + } + } + else if(side == SIDE_INPUT) { //it's the function name, at least it better be + side = SIDE_NAME; + fn_name.append("_"); + fn_name.append(token); + } + else if(side == SIDE_OUTPUT) { + if(token != toked.back()) throw; //the last token in the name is the alignment + } + } + } + //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input! + assert(inputsig.size() != 0); + +} + +inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], vlen, arch.c_str()); +} + +inline void run_cast_test2(volk_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str()); +} + +inline void run_cast_test3(volk_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); +} + +inline void run_cast_test4(volk_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); +} + +inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +} + +template <class t> +bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { + bool fail = false; + int print_max_errs = 10; + for(unsigned int i=0; i<vlen; i++) { + if(((t *)(in1))[i] < 1e-30) continue; //this is a hack: below around here we'll start to get roundoff errors due to limited precision + if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) { + fail=true; + if(print_max_errs-- > 0) { + std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl; + } + } + } + + return fail; +} + +template <class t> +bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) { + bool fail = false; + int print_max_errs = 10; + for(unsigned int i=0; i<vlen; i++) { + if(abs(int(((t *)(in1))[i]) - int(((t *)(in2))[i])) > tol) { + fail=true; + if(print_max_errs-- > 0) { + std::cout << "offset " << i << " in1: " << static_cast<int>(t(((t *)(in1))[i])) << " in2: " << static_cast<int>(t(((t *)(in2))[i])) << std::endl; + } + } + } + + return fail; +} + +class volk_qa_aligned_mem_pool{ +public: + void *get_new(size_t size){ + size_t alignment = volk_get_alignment(); + _mems.push_back(std::vector<char>(size + alignment-1, 0)); + size_t ptr = size_t(&_mems.back().front()); + return (void *)((ptr + alignment-1) & ~(alignment-1)); + } +private: std::list<std::vector<char> > _mems; +}; + +bool run_volk_tests(volk_func_desc_t desc, + void (*manual_func)(), + std::string name, + float tol, + lv_32fc_t scalar, + int vlen, + int iter, + std::vector<std::string> *best_arch_vector = 0, + std::string puppet_master_name = "NULL" + ) { + std::cout << "RUN_VOLK_TESTS: " << name << std::endl; + + //first let's get a list of available architectures for the test + std::vector<std::string> arch_list = get_arch_list(desc); + + if(arch_list.size() < 2) { + std::cout << "no architectures to test" << std::endl; + return false; + } + + //something that can hang onto memory and cleanup when this function exits + volk_qa_aligned_mem_pool mem_pool; + + //now we have to get a function signature by parsing the name + std::vector<volk_type_t> inputsig, outputsig; + get_signatures_from_name(inputsig, outputsig, name); + + //pull the input scalars into their own vector + std::vector<volk_type_t> inputsc; + for(size_t i=0; i<inputsig.size(); i++) { + if(inputsig[i].is_scalar) { + inputsc.push_back(inputsig[i]); + inputsig.erase(inputsig.begin() + i); + i -= 1; + } + } + //for(int i=0; i<inputsig.size(); i++) std::cout << "Input: " << inputsig[i].str << std::endl; + //for(int i=0; i<outputsig.size(); i++) std::cout << "Output: " << outputsig[i].str << std::endl; + std::vector<void *> inbuffs; + BOOST_FOREACH(volk_type_t sig, inputsig) { + if(!sig.is_scalar) //we don't make buffers for scalars + inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1))); + } + for(size_t i=0; i<inbuffs.size(); i++) { + load_random_data(inbuffs[i], inputsig[i], vlen); + } + + //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch + std::vector<std::vector<void *> > test_data; + for(size_t i=0; i<arch_list.size(); i++) { + std::vector<void *> arch_buffs; + for(size_t j=0; j<outputsig.size(); j++) { + arch_buffs.push_back(mem_pool.get_new(vlen*outputsig[j].size*(outputsig[j].is_complex ? 2 : 1))); + } + for(size_t j=0; j<inputsig.size(); j++) { + arch_buffs.push_back(inbuffs[j]); + } + test_data.push_back(arch_buffs); + } + + std::vector<volk_type_t> both_sigs; + both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end()); + both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end()); + + //now run the test + clock_t start, end; + std::vector<double> profile_times; + for(size_t i = 0; i < arch_list.size(); i++) { + start = clock(); + + switch(both_sigs.size()) { + case 1: + if(inputsc.size() == 0) { + run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } else throw "unsupported 1 arg function >1 scalars"; + break; + case 2: + if(inputsc.size() == 0) { + run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } else throw "unsupported 2 arg function >1 scalars"; + break; + case 3: + if(inputsc.size() == 0) { + run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } else throw "unsupported 3 arg function >1 scalars"; + break; + case 4: + run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + break; + default: + throw "no function handler for this signature"; + break; + } + + end = clock(); + double arch_time = (double)(end-start)/(double)CLOCKS_PER_SEC; + std::cout << arch_list[i] << " completed in " << arch_time << "s" << std::endl; + + profile_times.push_back(arch_time); + } + + //and now compare each output to the generic output + //first we have to know which output is the generic one, they aren't in order... + size_t generic_offset=0; + for(size_t i=0; i<arch_list.size(); i++) + if(arch_list[i] == "generic") generic_offset=i; + + //now compare + //if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know + + bool fail = false; + bool fail_global = false; + std::vector<bool> arch_results; + for(size_t i=0; i<arch_list.size(); i++) { + fail = false; + if(i != generic_offset) { + for(size_t j=0; j<both_sigs.size(); j++) { + if(both_sigs[j].is_float) { + if(both_sigs[j].size == 8) { + fail = fcompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } else { + fail = fcompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } + } else { + //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ + switch(both_sigs[j].size) { + case 8: + if(both_sigs[j].is_signed) { + fail = icompare((int64_t *) test_data[generic_offset][j], (int64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } else { + fail = icompare((uint64_t *) test_data[generic_offset][j], (uint64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } + break; + case 4: + if(both_sigs[j].is_signed) { + fail = icompare((int32_t *) test_data[generic_offset][j], (int32_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } else { + fail = icompare((uint32_t *) test_data[generic_offset][j], (uint32_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } + break; + case 2: + if(both_sigs[j].is_signed) { + fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } else { + fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } + break; + case 1: + if(both_sigs[j].is_signed) { + fail = icompare((int8_t *) test_data[generic_offset][j], (int8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } else { + fail = icompare((uint8_t *) test_data[generic_offset][j], (uint8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } + break; + default: + fail=1; + } + } + if(fail) { + fail_global = true; + std::cout << name << ": fail on arch " << arch_list[i] << std::endl; + } + //fail = memcmp(outbuffs[generic_offset], outbuffs[i], outputsig[0].size * vlen * (outputsig[0].is_complex ? 2:1)); + } + } + arch_results.push_back(!fail); + } + + double best_time_a = std::numeric_limits<double>::max(); + double best_time_u = std::numeric_limits<double>::max(); + std::string best_arch_a = "generic"; + std::string best_arch_u = "generic"; + for(size_t i=0; i < arch_list.size(); i++) + { + if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0) + { + best_time_u = profile_times[i]; + best_arch_u = arch_list[i]; + } + if((profile_times[i] < best_time_a) && arch_results[i]) + { + best_time_a = profile_times[i]; + best_arch_a = arch_list[i]; + } + } + + std::cout << "Best aligned arch: " << best_arch_a << std::endl; + std::cout << "Best unaligned arch: " << best_arch_u << std::endl; + if(best_arch_vector) { + if(puppet_master_name == "NULL") { + best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u); + } + else { + best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u); + } + } + + return fail_global; +} + + diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h new file mode 100644 index 000000000..0f17cdaa3 --- /dev/null +++ b/volk/lib/qa_utils.h @@ -0,0 +1,41 @@ +#ifndef VOLK_QA_UTILS_H +#define VOLK_QA_UTILS_H + +#include <cstdlib> +#include <string> +#include <vector> +#include <volk/volk.h> +#include <volk/volk_common.h> + +struct volk_type_t { + bool is_float; + bool is_scalar; + bool is_signed; + bool is_complex; + int size; + std::string str; +}; + +volk_type_t volk_type_from_string(std::string); + +float uniform(void); +void random_floats(float *buf, unsigned n); + +bool run_volk_tests(volk_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string); + + +#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); } +#define VOLK_PROFILE(func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL") +#define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func)) +typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place +typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*); +typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*); +typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*); +typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input +typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*); +typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*); +typedef void (*volk_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input +typedef void (*volk_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); +typedef void (*volk_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*); + +#endif //VOLK_QA_UTILS_H diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc new file mode 100644 index 000000000..f133897cb --- /dev/null +++ b/volk/lib/testqa.cc @@ -0,0 +1,90 @@ +#include "qa_utils.h" +#include <volk/volk.h> +#include <boost/test/unit_test.hpp> + +//VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000); +//VOLK_RUN_TESTS(volk_16i_branch_4_state_8, 1e-4, 2046, 10000); +VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f, 1e-5, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2, 1e-4, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_magnitude_16i, 1, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f, 1e-5, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16i_s32f_convert_32f, 1e-4, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16i_convert_8i, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_16i_max_star_16i, 0, 0, 20460, 10000); +//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i, 0, 0, 20460, 10000); +//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 1000); +//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 1000); +VOLK_RUN_TESTS(volk_16u_byteswap, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32f_accumulator_s32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20460, 1); +//VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_imag_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc, 1e-4, 0, 2046000, 1); +VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32fc_index_max_16u, 3, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i, 1, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_magnitude_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_16i, 1, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_32i, 1, 2<<31, 20460, 1); +VOLK_RUN_TESTS(volk_32f_convert_64f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_8i, 1, 128, 20460, 1); +//VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000); +VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f, 1e-4, 0, 2046, 1); +VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, 1e-4, 10, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_divide_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i, 1e-4, 0, 204600, 1); +//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000); +VOLK_RUN_TESTS(volk_32f_index_max_16u, 3, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic, 1, 32767, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_max_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_min_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_normalize, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_power_32f, 1e-4, 4, 20460, 1); +VOLK_RUN_TESTS(volk_32f_sqrt_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_subtract_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32i_x2_and_32i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32i_s32f_convert_32f, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32i_x2_or_32i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32u_byteswap, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_32u_popcnt, 0, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_64f_convert_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64f_x2_max_64f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64f_x2_min_64f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64u_byteswap, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_64u_popcnt, 0, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i, 0, 256, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8i_convert_16i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8i_s32f_convert_32f, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_conjugate_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_multiply_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1); diff --git a/volk/lib/volk_prefs.c b/volk/lib/volk_prefs.c new file mode 100644 index 000000000..f787b5e2a --- /dev/null +++ b/volk/lib/volk_prefs.c @@ -0,0 +1,50 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <volk/volk_prefs.h> + +//#if defined(_WIN32) +//#include <Windows.h> +//#endif + +void volk_get_config_path(char *path) +{ + const char *suffix = "/.volk/volk_config"; + char *home = NULL; + if (home == NULL) home = getenv("HOME"); + if (home == NULL) home = getenv("APPDATA"); + if (home == NULL){ + path = NULL; + return; + } + strcpy(path, home); + strcat(path, suffix); +} + +size_t volk_load_preferences(volk_arch_pref_t **prefs_res) +{ + FILE *config_file; + char path[512], line[512]; + size_t n_arch_prefs = 0; + volk_arch_pref_t *prefs = NULL; + + //get the config path + volk_get_config_path(path); + if (path == NULL) return n_arch_prefs; //no prefs found + config_file = fopen(path, "r"); + if(!config_file) return n_arch_prefs; //no prefs found + + //reset the file pointer and write the prefs into volk_arch_prefs + while(fgets(line, sizeof(line), config_file) != NULL) + { + prefs = (volk_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs)); + volk_arch_pref_t *p = prefs + n_arch_prefs; + if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5)) + { + n_arch_prefs++; + } + } + fclose(config_file); + *prefs_res = prefs; + return n_arch_prefs; +} diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c new file mode 100644 index 000000000..6ab013f26 --- /dev/null +++ b/volk/lib/volk_rank_archs.c @@ -0,0 +1,112 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#include <volk_rank_archs.h> +#include <volk/volk_prefs.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4 + #define __popcnt __builtin_popcount +#else + inline unsigned __popcnt(unsigned num) + { + unsigned pop = 0; + while(num) + { + if (num & 0x1) pop++; + num >>= 1; + } + return pop; + } +#endif + +int volk_get_index( + const char *impl_names[], //list of implementations by name + const size_t n_impls, //number of implementations available + const char *impl_name //the implementation name to find +){ + unsigned int i; + for (i = 0; i < n_impls; i++) { + if(!strncmp(impl_names[i], impl_name, 20)) { + return i; + } + } + //TODO return -1; + //something terrible should happen here + printf("Volk warning: no arch found, returning generic impl\n"); + return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now +} + +int volk_rank_archs( + const char *kern_name, //name of the kernel to rank + const char *impl_names[], //list of implementations by name + const int* impl_deps, //requirement mask per implementation + const bool* alignment, //alignment status of each implementation + size_t n_impls, //number of implementations available + const bool align //if false, filter aligned implementations +){ + size_t i; + static volk_arch_pref_t *volk_arch_prefs; + static size_t n_arch_prefs = 0; + static int prefs_loaded = 0; + if(!prefs_loaded) { + n_arch_prefs = volk_load_preferences(&volk_arch_prefs); + prefs_loaded = 1; + } + + //now look for the function name in the prefs list + for(i = 0; i < n_arch_prefs; i++) + { + if(!strncmp(kern_name, volk_arch_prefs[i].name, sizeof(volk_arch_prefs[i].name))) //found it + { + const char *impl_name = align? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u; + return volk_get_index(impl_names, n_impls, impl_name); + } + } + + //return the best index with the largest deps + size_t best_index_a = 0; + size_t best_index_u = 0; + int best_value_a = -1; + int best_value_u = -1; + for(i = 0; i < n_impls; i++) + { + const signed val = __popcnt(impl_deps[i]); + if (alignment[i] && val > best_value_a) + { + best_index_a = i; + best_value_a = val; + } + if (!alignment[i] && val > best_value_u) + { + best_index_u = i; + best_value_u = val; + } + } + + //when align and we found a best aligned, use it + if (align && best_value_a != -1) return best_index_a; + + //otherwise return the best unaligned + return best_index_u; +} diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h new file mode 100644 index 000000000..b3bf8ff17 --- /dev/null +++ b/volk/lib/volk_rank_archs.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_VOLK_RANK_ARCHS_H +#define INCLUDED_VOLK_RANK_ARCHS_H + +#include <stdlib.h> +#include <stdbool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +int volk_get_index( + const char *impl_names[], //list of implementations by name + const size_t n_impls, //number of implementations available + const char *impl_name //the implementation name to find +); + +int volk_rank_archs( + const char *kern_name, //name of the kernel to rank + const char *impl_names[], //list of implementations by name + const int* impl_deps, //requirement mask per implementation + const bool* alignment, //alignment status of each implementation + size_t n_impls, //number of implementations available + const bool align //if false, filter aligned implementations +); + +#ifdef __cplusplus +} +#endif +#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/ |