diff options
Diffstat (limited to 'volk/lib')
30 files changed, 2068 insertions, 0 deletions
diff --git a/volk/lib/.gitignore b/volk/lib/.gitignore new file mode 100644 index 000000000..6a5fde28f --- /dev/null +++ b/volk/lib/.gitignore @@ -0,0 +1,23 @@ +/*.cache +/*.la +/*.lo +/*.pc +/.deps +/.la +/.libs +/.lo +/Makefile +/Makefile.in +/volk.c +/volk_cpu_generic.c +/volk_cpu_powerpc.c +/volk_cpu_x86.c +/volk_environment_init.c +/volk_init.c +/volk_init.h +/volk_mktables +/volk_mktables.c +/volk_proccpu_sim.c +/volk_runtime.c +/test_all +/testqa diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am new file mode 100644 index 000000000..3e5502369 --- /dev/null +++ b/volk/lib/Makefile.am @@ -0,0 +1,157 @@ +# +# Copyright 2010,2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +#FIXME: forcing the top_builddir for distcheck seems like a bit +# of a hack. Figure out the right way to do this to find built +# volk_config.h and volk_tables.h + +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ + -I$(top_builddir)/include \ + $(LV_CXXFLAGS) $(WITH_INCLUDES) + + +# We build 1 library and 1 executable here. The library contains +# everything except the QA code. The C++ QA code is especially recommended +# when you have general purpose C or C++ code that may not get +# thoroughly exercised by building and running a GR block. The +# executable runs the QA code at "make check" time. +# +# +# +# N.B., If there's a SWIG generated shared library and associated +# python code, it will be contained in ../python, not here. (That +# code is conditionally built depending on the state of the +# --without-python configure option.) However, the .i should be here +# next to the .h that it's based on. + + +# list of programs run by "make check" and "make distcheck" +#TESTS = testqa +#orc stuff gets built in the ORC directory conditional to ORC being enabled. +#it gets linked in during the build of libvolk as an added library. +#there might be a better way to do this. + +lib_LTLIBRARIES = \ + libvolk.la \ + libvolk_runtime.la + +EXTRA_DIST = \ + volk_mktables.c \ + volk_rank_archs.h \ + volk_proccpu_sim.c \ + gcc_x86_cpuid.h + +# ---------------------------------------------------------------- +# The main library +# ---------------------------------------------------------------- + +libvolk_runtime_la_SOURCES = \ + $(platform_CODE) \ + volk_runtime.c \ + volk_init.c \ + volk_rank_archs.c + +libvolk_la_SOURCES = \ + $(platform_CODE) \ + volk.c \ + volk_environment_init.c + +volk_orc_LDFLAGS = \ + $(ORC_LDFLAGS) \ + -lorc-0.4 + +volk_orc_LIBADD = \ + ../orc/libvolk_orc.la + +if LV_HAVE_ORC +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) +libvolk_la_LIBADD = $(volk_orc_LIBADD) +else +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_la_LIBADD = +endif + + +# ---------------------------------------------------------------- +# The QA library. Note libvolk.la in LIBADD +# ---------------------------------------------------------------- +#libvolk_qa_la_SOURCES = \ +# qa_utils.cc + +#libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lboost + +#libvolk_qa_la_LIBADD = \ +# libvolk.la \ +# libvolk_runtime.la + +# ---------------------------------------------------------------- +# headers that don't get installed +# ---------------------------------------------------------------- +noinst_HEADERS = \ + volk_init.h \ + qa_utils.h + +# ---------------------------------------------------------------- +# Our test program +# ---------------------------------------------------------------- +noinst_PROGRAMS = \ + testqa + +testqa_SOURCES = testqa.cc qa_utils.cc +testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN $(AM_CPPFLAGS) +testqa_LDFLAGS = $(BOOST_UNIT_TEST_FRAMEWORK_LIB) +if LV_HAVE_ORC +testqa_LDADD = \ + libvolk.la \ + libvolk_runtime.la \ + ../orc/libvolk_orc.la +else +testqa_LDADD = \ + libvolk.la \ + libvolk_runtime.la +endif + +distclean-local: + rm -f volk.c + rm -f volk_cpu_generic.c + rm -f volk_cpu_powerpc.c + rm -f volk_cpu_x86.c + rm -f volk_init.c + rm -f volk_init.h + rm -f volk_mktables.c + rm -f volk_proccpu_sim.c + rm -f volk_runtime.c + rm -f volk_tables.h + rm -f volk_environment_init.c +#SUBDIRS = + +#ifdef BUILD_SSE +#SUBDIRS += sse +#elif BUILD_SPU +#SUBDIRS += spu +#else +#SUBDIRS += port +#endif + + diff --git a/volk/lib/gcc_x86_cpuid.h b/volk/lib/gcc_x86_cpuid.h new file mode 100644 index 000000000..2d0916fb3 --- /dev/null +++ b/volk/lib/gcc_x86_cpuid.h @@ -0,0 +1,178 @@ +/* + * Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 3, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Under Section 7 of GPL version 3, you are granted additional + * permissions described in the GCC Runtime Library Exception, version + * 3.1, as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License and + * a copy of the GCC Runtime Library Exception along with this program; + * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + * <http://www.gnu.org/licenses/>. + */ + +/* %ecx */ +#define bit_SSE3 (1 << 0) +#define bit_PCLMUL (1 << 1) +#define bit_SSSE3 (1 << 9) +#define bit_FMA (1 << 12) +#define bit_CMPXCHG16B (1 << 13) +#define bit_SSE4_1 (1 << 19) +#define bit_SSE4_2 (1 << 20) +#define bit_MOVBE (1 << 22) +#define bit_POPCNT (1 << 23) +#define bit_AES (1 << 25) +#define bit_XSAVE (1 << 26) +#define bit_OSXSAVE (1 << 27) +#define bit_AVX (1 << 28) + +/* %edx */ +#define bit_CMPXCHG8B (1 << 8) +#define bit_CMOV (1 << 15) +#define bit_MMX (1 << 23) +#define bit_FXSAVE (1 << 24) +#define bit_SSE (1 << 25) +#define bit_SSE2 (1 << 26) + +/* Extended Features */ +/* %ecx */ +#define bit_LAHF_LM (1 << 0) +#define bit_SSE4a (1 << 6) +#define bit_SSE5 (1 << 11) + +/* %edx */ +#define bit_LM (1 << 29) +#define bit_3DNOWP (1 << 30) +#define bit_3DNOW (1 << 31) + + +#if defined(__i386__) && defined(__PIC__) +/* %ebx may be the PIC register. */ +#if __GNUC__ >= 3 +#define __cpuid(level, a, b, c, d) \ + __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchg{l}\t{%%}ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level)) + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchg{l}\t{%%}ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) +#else +/* Host GCCs older than 3.0 weren't supporting Intel asm syntax + nor alternatives in i386 code. */ +#define __cpuid(level, a, b, c, d) \ + __asm__ ("xchgl\t%%ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchgl\t%%ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level)) + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ ("xchgl\t%%ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchgl\t%%ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) +#endif +#else +#define __cpuid(level, a, b, c, d) \ + __asm__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level)) + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) +#endif + +/* Return highest supported input value for cpuid instruction. ext can + be either 0x0 or 0x8000000 to return highest supported value for + basic or extended cpuid information. Function returns 0 if cpuid + is not supported or whatever cpuid returns in eax register. If sig + pointer is non-null, then first four bytes of the signature + (as found in ebx register) are returned in location pointed by sig. */ + +static __inline unsigned int +__get_cpuid_max (unsigned int __ext, unsigned int *__sig) +{ + unsigned int __eax, __ebx, __ecx, __edx; + +#ifndef __x86_64__ +#if __GNUC__ >= 3 + /* See if we can use cpuid. On AMD64 we always can. */ + __asm__ ("pushf{l|d}\n\t" + "pushf{l|d}\n\t" + "pop{l}\t%0\n\t" + "mov{l}\t{%0, %1|%1, %0}\n\t" + "xor{l}\t{%2, %0|%0, %2}\n\t" + "push{l}\t%0\n\t" + "popf{l|d}\n\t" + "pushf{l|d}\n\t" + "pop{l}\t%0\n\t" + "popf{l|d}\n\t" + : "=&r" (__eax), "=&r" (__ebx) + : "i" (0x00200000)); +#else +/* Host GCCs older than 3.0 weren't supporting Intel asm syntax + nor alternatives in i386 code. */ + __asm__ ("pushfl\n\t" + "pushfl\n\t" + "popl\t%0\n\t" + "movl\t%0, %1\n\t" + "xorl\t%2, %0\n\t" + "pushl\t%0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl\t%0\n\t" + "popfl\n\t" + : "=&r" (__eax), "=&r" (__ebx) + : "i" (0x00200000)); +#endif + + if (!((__eax ^ __ebx) & 0x00200000)) + return 0; +#endif + + /* Host supports cpuid. Return highest supported cpuid input value. */ + __cpuid (__ext, __eax, __ebx, __ecx, __edx); + + if (__sig) + *__sig = __ebx; + + return __eax; +} + +/* Return cpuid data for requested cpuid level, as found in returned + eax, ebx, ecx and edx registers. The function checks if cpuid is + supported and returns 1 for valid cpuid information or 0 for + unsupported cpuid level. All pointers are required to be non-null. */ + +static __inline int +__get_cpuid (unsigned int __level, + unsigned int *__eax, unsigned int *__ebx, + unsigned int *__ecx, unsigned int *__edx) +{ + unsigned int __ext = __level & 0x80000000; + + if (__get_cpuid_max (__ext, 0) < __level) + return 0; + + __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx); + return 1; +} diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc new file mode 100644 index 000000000..154aa0f17 --- /dev/null +++ b/volk/lib/qa_16s_add_quad_aligned16.cc @@ -0,0 +1,89 @@ +#include <volk/volk.h> +#include <qa_16s_add_quad_aligned16.h> +#include <volk/volk_16s_add_quad_aligned16.h> +#include <cstdlib> +#include <ctime> +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_add_quad_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + + + +void qa_16s_add_quad_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3200; + const int ITERS = 100000; + short input0[vlen] __attribute__ ((aligned (16))); + short input1[vlen] __attribute__ ((aligned (16))); + short input2[vlen] __attribute__ ((aligned (16))); + short input3[vlen] __attribute__ ((aligned (16))); + short input4[vlen] __attribute__ ((aligned (16))); + + short output0[vlen] __attribute__ ((aligned (16))); + short output1[vlen] __attribute__ ((aligned (16))); + short output2[vlen] __attribute__ ((aligned (16))); + short output3[vlen] __attribute__ ((aligned (16))); + short output01[vlen] __attribute__ ((aligned (16))); + short output11[vlen] __attribute__ ((aligned (16))); + short output21[vlen] __attribute__ ((aligned (16))); + short output31[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus1 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus1 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus2 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus2 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; + + input0[i] = plus0 - minus0; + input1[i] = plus1 - minus1; + input2[i] = plus2 - minus2; + input3[i] = plus3 - minus3; + input4[i] = plus4 - minus4; + + } + printf("16s_add_quad_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_add_quad_aligned16_manual(output0, output1, output2, output3, input0, input1, input2, input3, input4, vlen << 1 , "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_add_quad_aligned16_manual(output01, output11, output21, output31, input0, input1, input2, input3, input4, vlen << 1 , "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output1[i], output11[i]); + CPPUNIT_ASSERT_EQUAL(output2[i], output21[i]); + CPPUNIT_ASSERT_EQUAL(output3[i], output31[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_add_quad_aligned16.h b/volk/lib/qa_16s_add_quad_aligned16.h new file mode 100644 index 000000000..3c1ae978b --- /dev/null +++ b/volk/lib/qa_16s_add_quad_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H +#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_add_quad_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc new file mode 100644 index 000000000..62deffaeb --- /dev/null +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc @@ -0,0 +1,106 @@ +#include <volk/volk.h> +#include <qa_16s_branch_4_state_8_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for ssse3 + +#ifndef LV_HAVE_SSSE3 + +void qa_16s_branch_4_state_8_aligned16::t1() { + printf("ssse3 not available... no test performed\n"); +} + +#else + +void qa_16s_branch_4_state_8_aligned16::t1() { + const int num_iters = 1000000; + const int vlen = 32; + + static char permute0[16]__attribute__((aligned(16))) = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03}; + static char permute1[16]__attribute__((aligned(16))) = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01}; + static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f}; + static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d}; + static char* permuters[4] = {permute0, permute1, permute2, permute3}; + + unsigned int num_bytes = vlen << 1; + + volk_environment_init(); + clock_t start, end; + double total; + + short target[vlen] __attribute__ ((aligned (16))); + short target2[vlen] __attribute__ ((aligned (16))); + short target3[vlen] __attribute__ ((aligned (16))); + + short src0[vlen] __attribute__ ((aligned (16))); + short permute_indexes[vlen] __attribute__ ((aligned (16))) = { +7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 }; + short cntl0[vlen] __attribute__ ((aligned (16))) = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + short cntl1[vlen] __attribute__ ((aligned (16))) = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + short cntl2[vlen] __attribute__ ((aligned (16))) = { + 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 }; + short cntl3[vlen] __attribute__ ((aligned (16))) = { + 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff }; + short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4}; + + + + for(int i = 0; i < vlen; ++i) { + src0[i] = i; + + } + + + printf("16s_branch_4_state_8_aligned\n"); + + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("permute_and_scalar_add_time: %f\n", total); + + + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("branch_4_state_8_time, ssse3: %f\n", total); + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("permute_and_scalar_add_time, generic: %f\n", total); + + + + for(int i = 0; i < vlen; ++i) { + printf("psa... %d, b4s8... %d\n", target[i], target3[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT(target[i] == target2[i]); + CPPUNIT_ASSERT(target[i] == target3[i]); + } +} + + +#endif diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.h b/volk/lib/qa_16s_branch_4_state_8_aligned16.h new file mode 100644 index 000000000..41ab073e0 --- /dev/null +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H +#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc new file mode 100644 index 000000000..819b2256b --- /dev/null +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc @@ -0,0 +1,78 @@ +#include <volk/volk.h> +#include <qa_16s_permute_and_scalar_add_aligned16.h> +#include <volk/volk_16s_permute_and_scalar_add_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_permute_and_scalar_add_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_16s_permute_and_scalar_add_aligned16::t1() { + const int vlen = 64; + + unsigned int num_bytes = vlen << 1; + + volk_environment_init(); + clock_t start, end; + double total; + + short target[vlen] __attribute__ ((aligned (16))); + short target2[vlen] __attribute__ ((aligned (16))); + short src0[vlen] __attribute__ ((aligned (16))); + short permute_indexes[vlen] __attribute__ ((aligned (16))); + short cntl0[vlen] __attribute__ ((aligned (16))); + short cntl1[vlen] __attribute__ ((aligned (16))); + short cntl2[vlen] __attribute__ ((aligned (16))); + short cntl3[vlen] __attribute__ ((aligned (16))); + short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4}; + + for(int i = 0; i < vlen; ++i) { + src0[i] = i; + permute_indexes[i] = (3 * i)%vlen; + cntl0[i] = 0xff; + cntl1[i] = 0xff * (i%2); + cntl2[i] = 0xff * ((i>>1)%2); + cntl3[i] = 0xff * ((i%4) == 3); + } + + printf("16s_permute_and_scalar_add_aligned\n"); + + start = clock(); + for(int i = 0; i < 100000; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("generic_time: %f\n", total); + + start = clock(); + for(int i = 0; i < 100000; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("sse2_time: %f\n", total); + + + for(int i = 0; i < vlen; ++i) { + //printf("generic... %d, sse2... %d\n", target[i], target2[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT(target[i] == target2[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h new file mode 100644 index 000000000..3643aeef6 --- /dev/null +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H +#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc new file mode 100644 index 000000000..66f8c9afa --- /dev/null +++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_16s_quad_max_star_aligned16.h> +#include <volk/volk_16s_quad_max_star_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_quad_max_star_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_16s_quad_max_star_aligned16::t1() { + const int vlen = 34; + + short input0[vlen] __attribute__ ((aligned (16))); + short input1[vlen] __attribute__ ((aligned (16))); + short input2[vlen] __attribute__ ((aligned (16))); + short input3[vlen] __attribute__ ((aligned (16))); + + short output0[vlen] __attribute__ ((aligned (16))); + short output1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = (short) (rand() - (RAND_MAX/2)); + short plus1 = (short) (rand() - (RAND_MAX/2)); + short plus2 = (short) (rand() - (RAND_MAX/2)); + short plus3 = (short) (rand() - (RAND_MAX/2)); + + short minus0 = (short) (rand() - (RAND_MAX/2)); + short minus1 = (short) (rand() - (RAND_MAX/2)); + short minus2 = (short) (rand() - (RAND_MAX/2)); + short minus3 = (short) (rand() - (RAND_MAX/2)); + + input0[i] = plus0 - minus0; + input1[i] = plus1 - minus1; + input2[i] = plus2 - minus2; + input3[i] = plus3 - minus3; + } + + volk_16s_quad_max_star_aligned16_manual(output0, input0, input1, input2, input3, 2*vlen, "generic"); + + volk_16s_quad_max_star_aligned16_manual(output1, input0, input1, input2, input3, 2*vlen, "sse2"); + + printf("16s_quad_max_star_aligned\n"); + for(int i = 0; i < vlen; ++i) { + printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.h b/volk/lib/qa_16s_quad_max_star_aligned16.h new file mode 100644 index 000000000..51e77081a --- /dev/null +++ b/volk/lib/qa_16s_quad_max_star_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H +#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc new file mode 100644 index 000000000..592304f83 --- /dev/null +++ b/volk/lib/qa_32f_fm_detect_aligned16.cc @@ -0,0 +1,61 @@ +#include <volk/volk.h> +#include <qa_32f_fm_detect_aligned16.h> +#include <volk/volk_32f_fm_detect_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_fm_detect_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_fm_detect_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + float input0[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_fm_detect_aligned\n"); + + start = clock(); + float save = 0.1; + for(int count = 0; count < ITERS; ++count) { + volk_32f_fm_detect_aligned16_manual(output0, input0, 1.0, &save, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + save = 0.1; + for(int count = 0; count < ITERS; ++count) { + volk_32f_fm_detect_aligned16_manual(output01, input0, 1.0, &save, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_fm_detect_aligned16.h b/volk/lib/qa_32f_fm_detect_aligned16.h new file mode 100644 index 000000000..a2680c524 --- /dev/null +++ b/volk/lib/qa_32f_fm_detect_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H +#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc new file mode 100644 index 000000000..a1c3d4cd1 --- /dev/null +++ b/volk/lib/qa_32f_index_max_aligned16.cc @@ -0,0 +1,103 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32f_index_max_aligned16.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 1000000 +#define VEC_LEN 3097 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE + +void qa_32f_index_max_aligned16::t1(){ + printf("sse not available... no test performed\n"); +} + +#else + + +void qa_32f_index_max_aligned16::t1(){ + + const int vlen = VEC_LEN; + + + volk_runtime_init(); + + volk_environment_init(); + int ret; + + unsigned int* target_sse4_1; + unsigned int* target_sse; + unsigned int* target_generic; + float* src0 ; + + + unsigned int i_target_sse4_1; + target_sse4_1 = &i_target_sse4_1; + unsigned int i_target_sse; + target_sse = &i_target_sse; + unsigned int i_target_generic; + target_generic = &i_target_generic; + + ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float)); + + random_floats((float*)src0, vlen); + + printf("32f_index_max_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + get_volk_runtime()->volk_32f_index_max_aligned16(target_sse4_1, src0, vlen); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1 time: %f\n", total); + + + printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]); + CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]); + CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]); + + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32f_index_max_aligned16.h b/volk/lib/qa_32f_index_max_aligned16.h new file mode 100644 index 000000000..8cadffa47 --- /dev/null +++ b/volk/lib/qa_32f_index_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H +#define INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_index_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc new file mode 100644 index 000000000..4d83f1639 --- /dev/null +++ b/volk/lib/qa_32fc_index_max_aligned16.cc @@ -0,0 +1,89 @@ +#include <volk/volk.h> +#include <qa_32fc_index_max_aligned16.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 1000000 +#define VEC_LEN 3096 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_index_max_aligned16::t1(){ + printf("sse3 not available... no test performed\n"); +} + +#else + + +void qa_32fc_index_max_aligned16::t1(){ + + const int vlen = VEC_LEN; + + volk_environment_init(); + int ret; + + unsigned int* target; + unsigned int* target_generic; + std::complex<float>* src0 ; + + + unsigned int i_target; + target = &i_target; + unsigned int i_target_generic; + target_generic = &i_target_generic; + ret = posix_memalign((void**)&src0, 16, vlen << 3); + + random_floats((float*)src0, vlen * 2); + + printf("32fc_index_max_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 time: %f\n", total); + + + + + printf("generic: %u, sse3: %u\n", target_generic[0], target[0]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1); + + + + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_index_max_aligned16.h b/volk/lib/qa_32fc_index_max_aligned16.h new file mode 100644 index 000000000..0990bcb1f --- /dev/null +++ b/volk/lib/qa_32fc_index_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H +#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_index_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc new file mode 100644 index 000000000..a3d0955bd --- /dev/null +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc @@ -0,0 +1,64 @@ +#include <volk/volk.h> +#include <qa_32fc_power_spectral_density_32f_aligned16.h> +#include <volk/volk_32fc_power_spectral_density_32f_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for sse3 + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_power_spectral_density_32f_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#else + +void qa_32fc_power_spectral_density_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + std::complex<float> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse3[vlen] __attribute__ ((aligned (16))); + + const float scalar = vlen; + const float rbw = 1.7; + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); + } + printf("32fc_power_spectral_density_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4)); + } +} + +#endif diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h new file mode 100644 index 000000000..26f430bec --- /dev/null +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc new file mode 100644 index 000000000..fefdf06ee --- /dev/null +++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc @@ -0,0 +1,138 @@ +#include <volk/volk.h> +#include <qa_32fc_x2_conjugate_dot_prod_32fc_u.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse + +#if LV_HAVE_SSE && LV_HAVE_64 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex<float>* input; + std::complex<float>* taps; + + std::complex<float>* result_generic; + std::complex<float>* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex<float>(0,0); + result[0] = std::complex<float>(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse"); + + printf("32fc_x2_conjugate_dot_prod_32fc_u\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#elif LV_HAVE_SSE && LV_HAVE_32 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex<float>* input; + std::complex<float>* taps; + + std::complex<float>* result_generic; + std::complex<float>* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex<float>(0,0); + result[0] = std::complex<float>(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse_32"); + + printf("32fc_x2_conjugate_dot_prod_32fc_u\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#else + +void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { + printf("sse not available... no test performed\n"); +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h new file mode 100644 index 000000000..f07402403 --- /dev/null +++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H +#define INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_x2_conjugate_dot_prod_32fc_u : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_x2_conjugate_dot_prod_32fc_u); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H */ diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc new file mode 100644 index 000000000..618a82a02 --- /dev/null +++ b/volk/lib/qa_32u_popcnt_aligned16.cc @@ -0,0 +1,62 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32u_popcnt_aligned16.h> +#include <volk/volk_32u_popcnt_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for sse + +#ifndef LV_HAVE_SSE4_2 + +void qa_32u_popcnt_aligned16::t1() { + printf("sse4.2 not available... no test performed\n"); +} + +#else + +void qa_32u_popcnt_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + + const int ITERS = 10000000; + uint32_t input0 __attribute__ ((aligned (16))); + + uint32_t output0 __attribute__ ((aligned (16))); + uint32_t output01 __attribute__ ((aligned (16))); + + input0 = ((uint32_t) (rand() - (RAND_MAX/2))); + output0 = 0; + output01 = 0; + + printf("32u_popcnt_aligned\n"); + + start = clock(); + uint32_t ret = 0; + for(int count = 0; count < ITERS; ++count) { + volk_32u_popcnt_aligned16_manual(&ret, input0, "generic"); + output0 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + ret = 0; + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32u_popcnt_aligned16(&ret, input0); + output01 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.2_time: %f\n", total); + + + CPPUNIT_ASSERT_EQUAL(output0, output01); +} + +#endif diff --git a/volk/lib/qa_32u_popcnt_aligned16.h b/volk/lib/qa_32u_popcnt_aligned16.h new file mode 100644 index 000000000..fa1dc1041 --- /dev/null +++ b/volk/lib/qa_32u_popcnt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32U_POPCNT_ALIGNED16_H +#define INCLUDED_QA_32U_POPCNT_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32u_popcnt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32u_popcnt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32U_POPCNT_ALIGNED16_H */ diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc new file mode 100644 index 000000000..85ef58795 --- /dev/null +++ b/volk/lib/qa_64u_popcnt_aligned16.cc @@ -0,0 +1,62 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_64u_popcnt_aligned16.h> +#include <volk/volk_64u_popcnt_aligned16.h> +#include <cstdlib> +#include <ctime> + +//test for sse + +#ifndef LV_HAVE_SSE4_2 + +void qa_64u_popcnt_aligned16::t1() { + printf("sse4.2 not available... no test performed\n"); +} + +#else + +void qa_64u_popcnt_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + + const int ITERS = 10000000; + uint64_t input0 __attribute__ ((aligned (16))); + + uint64_t output0 __attribute__ ((aligned (16))); + uint64_t output01 __attribute__ ((aligned (16))); + + input0 = ((uint64_t) (rand() - (RAND_MAX/2))); + output0 = 0; + output01 = 0; + + printf("64u_popcnt_aligned\n"); + + start = clock(); + uint64_t ret = 0; + for(int count = 0; count < ITERS; ++count) { + volk_64u_popcnt_aligned16_manual(&ret, input0, "generic"); + output0 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + ret = 0; + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_64u_popcnt_aligned16(&ret, input0); + output01 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.2_time: %f\n", total); + + + CPPUNIT_ASSERT_EQUAL(output0, output01); +} + +#endif diff --git a/volk/lib/qa_64u_popcnt_aligned16.h b/volk/lib/qa_64u_popcnt_aligned16.h new file mode 100644 index 000000000..217822d6e --- /dev/null +++ b/volk/lib/qa_64u_popcnt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64U_POPCNT_ALIGNED16_H +#define INCLUDED_QA_64U_POPCNT_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_64u_popcnt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64u_popcnt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64U_POPCNT_ALIGNED16_H */ diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc new file mode 100644 index 000000000..b0f63d2b5 --- /dev/null +++ b/volk/lib/qa_utils.cc @@ -0,0 +1,440 @@ +#include "qa_utils.h" +#include <cstring> +#include <boost/foreach.hpp> +#include <boost/assign/list_of.hpp> +#include <boost/tokenizer.hpp> +//#include <boost/test/unit_test.hpp> +#include <iostream> +#include <vector> +#include <list> +#include <ctime> +#include <cmath> +#include <boost/lexical_cast.hpp> +//#include <volk/volk_runtime.h> +#include <volk/volk_registry.h> +#include <volk/volk.h> +#include <boost/typeof/typeof.hpp> +#include <boost/type_traits.hpp> + +float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +template <class t> +void random_floats (t *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +void load_random_data(void *data, volk_type_t type, unsigned int n) { + if(type.is_complex) n *= 2; + if(type.is_float) { + if(type.size == 8) random_floats<double>((double *)data, n); + else random_floats<float>((float *)data, n); + } else { + float int_max = float(uint64_t(2) << (type.size*8)); + if(type.is_signed) int_max /= 2.0; + for(int i=0; i<n; i++) { + float scaled_rand = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * int_max; + //man i really don't know how to do this in a more clever way, you have to cast down at some point + switch(type.size) { + case 8: + if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand; + else ((uint64_t *)data)[i] = (uint64_t) scaled_rand; + break; + case 4: + if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand; + else ((uint32_t *)data)[i] = (uint32_t) scaled_rand; + break; + case 2: + if(type.is_signed) ((int16_t *)data)[i] = (int16_t) scaled_rand; + else ((uint16_t *)data)[i] = (uint16_t) scaled_rand; + break; + case 1: + if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand; + else ((uint8_t *)data)[i] = (uint8_t) scaled_rand; + break; + default: + throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here + } + } + } +} + +static std::vector<std::string> get_arch_list(const int archs[]) { + std::vector<std::string> archlist; + int num_archs = archs[0]; + + //there has got to be a way to query these arches + for(int i = 0; i < num_archs; i++) { + switch(archs[i+1]) { + case (1<<LV_GENERIC): + archlist.push_back("generic"); + break; + case (1<<LV_ORC): + archlist.push_back("orc"); + break; + case (1<<LV_SSE): + archlist.push_back("sse"); + break; + case (1<<LV_SSE2): + archlist.push_back("sse2"); + break; + case (1<<LV_SSE3): + archlist.push_back("sse3"); + break; + case (1<<LV_SSSE3): + archlist.push_back("ssse3"); + break; + case (1<<LV_SSE4_1): + archlist.push_back("sse4_1"); + break; + case (1<<LV_SSE4_2): + archlist.push_back("sse4_2"); + break; + case (1<<LV_SSE4_A): + archlist.push_back("sse4_a"); + break; + case (1<<LV_MMX): + archlist.push_back("mmx"); + break; + case (1<<LV_AVX): + archlist.push_back("avx"); + break; + default: + break; + } + } + return archlist; +} + +volk_type_t volk_type_from_string(std::string name) { + volk_type_t type; + type.is_float = false; + type.is_scalar = false; + type.is_complex = false; + type.is_signed = false; + type.size = 0; + type.str = name; + + if(name.size() < 2) throw std::string("name too short to be a datatype"); + + //is it a scalar? + if(name[0] == 's') { + type.is_scalar = true; + name = name.substr(1, name.size()-1); + } + + //get the data size + int last_size_pos = name.find_last_of("0123456789"); + if(last_size_pos < 0) throw std::string("no size spec in type ").append(name); + //will throw if malformed + int size = boost::lexical_cast<int>(name.substr(0, last_size_pos+1)); + + assert(((size % 8) == 0) && (size <= 64) && (size != 0)); + type.size = size/8; //in bytes + + for(int i=last_size_pos+1; i < name.size(); i++) { + switch (name[i]) { + case 'f': + type.is_float = true; + break; + case 'i': + type.is_signed = true; + break; + case 'c': + type.is_complex = true; + break; + case 'u': + type.is_signed = false; + break; + default: + throw; + } + } + + return type; +} + +static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, + std::vector<volk_type_t> &outputsig, + std::string name) { + boost::char_separator<char> sep("_"); + boost::tokenizer<boost::char_separator<char> > tok(name, sep); + std::vector<std::string> toked; + tok.assign(name); + toked.assign(tok.begin(), tok.end()); + + assert(toked[0] == "volk"); + toked.erase(toked.begin()); + + //ok. we're assuming a string in the form + //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment) + + enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT; + std::string fn_name; + volk_type_t type; + BOOST_FOREACH(std::string token, toked) { + try { + type = volk_type_from_string(token); + if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... + + if(side == SIDE_INPUT) inputsig.push_back(type); + else outputsig.push_back(type); + } catch (...){ + if(token[0] == 'x') { //it's a multiplier + if(side == SIDE_INPUT) assert(inputsig.size() > 0); + else assert(outputsig.size() > 0); + int multiplier = boost::lexical_cast<int>(token.substr(1, token.size()-1)); //will throw if invalid + for(int i=1; i<multiplier; i++) { + if(side == SIDE_INPUT) inputsig.push_back(inputsig.back()); + else outputsig.push_back(outputsig.back()); + } + } + else if(side == SIDE_INPUT) { //it's the function name, at least it better be + side = SIDE_NAME; + fn_name.append("_"); + fn_name.append(token); + } + else if(side == SIDE_OUTPUT) { + if(token != toked.back()) throw; //the last token in the name is the alignment + } + } + } + //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input! + assert(inputsig.size() != 0); +} + +inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], vlen, arch.c_str()); +} + +inline void run_cast_test2(volk_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str()); +} + +inline void run_cast_test3(volk_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); +} + +inline void run_cast_test4(volk_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); +} + +inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +} + +template <class t> +bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { + bool fail = false; + int print_max_errs = 10; + for(int i=0; i<vlen; i++) { + if(((t *)(in1))[i] < 1e-30) continue; //this is a hack: below around here we'll start to get roundoff errors due to limited precision + if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) { + fail=true; + if(print_max_errs-- > 0) { + std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl; + } + } + } + + return fail; +} + +template <class t> +bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) { + bool fail = false; + int print_max_errs = 10; + for(int i=0; i<vlen; i++) { + if(abs(((t *)(in1))[i] - ((t *)(in2))[i]) > tol) { + fail=true; + if(print_max_errs-- > 0) { + std::cout << "offset " << i << " in1: " << static_cast<int>(t(((t *)(in1))[i])) << " in2: " << static_cast<int>(t(((t *)(in2))[i])) << std::endl; + } + } + } + + return fail; +} + +class volk_qa_aligned_mem_pool{ +public: + void *get_new(size_t size, size_t alignment = 16){ + _mems.push_back(std::vector<char>(size + alignment-1, 0)); + size_t ptr = size_t(&_mems.back().front()); + return (void *)((ptr + alignment-1) & ~(alignment-1)); + } +private: std::list<std::vector<char> > _mems; +}; + +bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, float scalar, int vlen, int iter) { + std::cout << "RUN_VOLK_TESTS: " << name << std::endl; + + //first let's get a list of available architectures for the test + std::vector<std::string> arch_list = get_arch_list(archs); + + if(arch_list.size() < 2) { + std::cout << "no architectures to test" << std::endl; + return false; + } + + //something that can hang onto memory and cleanup when this function exits + volk_qa_aligned_mem_pool mem_pool; + + //now we have to get a function signature by parsing the name + std::vector<volk_type_t> inputsig, outputsig; + get_signatures_from_name(inputsig, outputsig, name); + + //pull the input scalars into their own vector + std::vector<volk_type_t> inputsc; + for(int i=0; i<inputsig.size(); i++) { + if(inputsig[i].is_scalar) { + inputsc.push_back(inputsig[i]); + inputsig.erase(inputsig.begin() + i); + } + } + + //for(int i=0; i<inputsig.size(); i++) std::cout << "Input: " << inputsig[i].str << std::endl; + //for(int i=0; i<outputsig.size(); i++) std::cout << "Output: " << outputsig[i].str << std::endl; + std::vector<void *> inbuffs; + BOOST_FOREACH(volk_type_t sig, inputsig) { + if(!sig.is_scalar) //we don't make buffers for scalars + inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1))); + } + for(int i=0; i<inbuffs.size(); i++) { + load_random_data(inbuffs[i], inputsig[i], vlen); + } + + //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch + std::vector<std::vector<void *> > test_data; + for(int i=0; i<arch_list.size(); i++) { + std::vector<void *> arch_buffs; + for(int j=0; j<outputsig.size(); j++) { + arch_buffs.push_back(mem_pool.get_new(vlen*outputsig[j].size*(outputsig[j].is_complex ? 2 : 1))); + } + for(int j=0; j<inputsig.size(); j++) { + arch_buffs.push_back(inbuffs[j]); + } + test_data.push_back(arch_buffs); + } + + std::vector<volk_type_t> both_sigs; + both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end()); + both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end()); + + //now run the test + clock_t start, end; + for(int i = 0; i < arch_list.size(); i++) { + start = clock(); + + switch(both_sigs.size()) { + case 1: + if(inputsc.size() == 0) { + run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else throw "unsupported 1 arg function >1 scalars"; + break; + case 2: + if(inputsc.size() == 0) { + run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else throw "unsupported 2 arg function >1 scalars"; + break; + case 3: + if(inputsc.size() == 0) { + run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else throw "unsupported 3 arg function >1 scalars"; + break; + case 4: + run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + break; + default: + throw "no function handler for this signature"; + break; + } + + end = clock(); + std::cout << arch_list[i] << " completed in " << (double)(end-start)/(double)CLOCKS_PER_SEC << "s" << std::endl; + } + //and now compare each output to the generic output + //first we have to know which output is the generic one, they aren't in order... + int generic_offset=0; + for(int i=0; i<arch_list.size(); i++) + if(arch_list[i] == "generic") generic_offset=i; + + //now compare + //if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know + + bool fail = false; + bool fail_global = false; + for(int i=0; i<arch_list.size(); i++) { + if(i != generic_offset) { + for(int j=0; j<both_sigs.size(); j++) { + if(both_sigs[j].is_float) { + if(both_sigs[j].size == 8) { + fail = fcompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } else { + fail = fcompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } + } else { + //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ + switch(both_sigs[j].size) { + case 8: + if(both_sigs[j].is_signed) { + fail = icompare((int64_t *) test_data[generic_offset][j], (int64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } else { + fail = icompare((uint64_t *) test_data[generic_offset][j], (uint64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } + break; + case 4: + if(both_sigs[j].is_signed) { + fail = icompare((int32_t *) test_data[generic_offset][j], (int32_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } else { + fail = icompare((uint32_t *) test_data[generic_offset][j], (uint32_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } + break; + case 2: + if(both_sigs[j].is_signed) { + fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } else { + fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } + break; + case 1: + if(both_sigs[j].is_signed) { + fail = icompare((int8_t *) test_data[generic_offset][j], (int8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } else { + fail = icompare((uint8_t *) test_data[generic_offset][j], (uint8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol); + } + break; + default: + fail=1; + } + } + if(fail) { + fail_global = true; + std::cout << name << ": fail on arch " << arch_list[i] << std::endl; + } + //fail = memcmp(outbuffs[generic_offset], outbuffs[i], outputsig[0].size * vlen * (outputsig[0].is_complex ? 2:1)); + } + } + } + + return fail_global; +} + + diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h new file mode 100644 index 000000000..1b64bacaa --- /dev/null +++ b/volk/lib/qa_utils.h @@ -0,0 +1,33 @@ +#ifndef VOLK_QA_UTILS_H +#define VOLK_QA_UTILS_H + +#include <cstdlib> +#include <string> + +struct volk_type_t { + bool is_float; + bool is_scalar; + bool is_signed; + bool is_complex; + int size; + std::string str; +}; + +volk_type_t volk_type_from_string(std::string); + +float uniform(void); +void random_floats(float *buf, unsigned n); + +bool run_volk_tests(const int[], void(*)(), std::string, float, float, int, int); + +#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_CHECK_EQUAL(run_volk_tests(func##_arch_defs, (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter), 0) + +typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place +typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*); +typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*); +typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*); +typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input +typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*); +typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*); + +#endif //VOLK_QA_UTILS_H diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc new file mode 100644 index 000000000..779bc61eb --- /dev/null +++ b/volk/lib/testqa.cc @@ -0,0 +1,100 @@ +#include "qa_utils.h" +#include <volk/volk.h> +#include <volk/volk_registry.h> +#include <boost/test/unit_test.hpp> + +BOOST_AUTO_TEST_CASE(volk_test_all) { + //in order... +// VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a16, 1e-4, 2046, 10000); +// VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a16, 1e-4, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 32768.0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a16, 1e-5, 32768.0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 32768.0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a16, 1, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a16, 1e-5, 32768.0, 2046, 10000); + VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a16, 1e-4, 32768.0, 2046, 10000); + VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 2046, 10000); + VOLK_RUN_TESTS(volk_16i_convert_8i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16i_max_star_16i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a16, 0, 0, 2046, 10000); +// VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a16, 1e-4, 0, 2046, 10000); +// VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16u_byteswap_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_add_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a16, 1e-4, 0, 2046, 1000); + VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a16, 1e-4, 20.0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a16, 1e-4, 10.0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a16, 0, 32768, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_index_max_16u_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 1, 32768, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a16, 1, 32768, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a16, 1, 2<<31, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_convert_64f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 1, 128, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 2046, 10000); +// VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a16, 1e-4, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16, 1e-4, 10, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 2046, 10000); +// VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a16, 1e-4, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 1, 32768, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_max_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_min_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_normalize_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a16, 1e-4, 4, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_sqrt_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32i_x2_and_32i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_32i_x2_or_32i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32u_byteswap_a16, 0, 0, 2046, 10000); +// VOLK_RUN_TESTS(volk_32u_popcnt_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_64f_convert_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_64f_x2_max_64f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_64f_x2_min_64f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_64u_byteswap_a16, 0, 0, 2046, 10000); +// VOLK_RUN_TESTS(volk_64u_popcnt_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a16, 0, 256, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_8i_convert_16i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 2046, 10000); + +} diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c new file mode 100644 index 000000000..b1a93db26 --- /dev/null +++ b/volk/lib/volk_rank_archs.c @@ -0,0 +1,13 @@ +#include<volk_rank_archs.h> +#include<stdio.h> + +unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch) { + int i = 2; + unsigned int best_val = 0; + for(; i < arch_defs[0] + 1; ++i) { + if((arch_defs[i]&(!arch)) == 0) { + best_val = (arch_defs[i] > arch_defs[best_val + 1]) ? i-1 : best_val; + } + } + return best_val; +} diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h new file mode 100644 index 000000000..26b9f7503 --- /dev/null +++ b/volk/lib/volk_rank_archs.h @@ -0,0 +1,14 @@ +#ifndef INCLUDED_VOLK_RANK_ARCHS_H +#define INCLUDED_VOLK_RANK_ARCHS_H + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch); + + +#ifdef __cplusplus +} +#endif +#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/ |