diff options
Diffstat (limited to 'volk/lib')
183 files changed, 9136 insertions, 0 deletions
diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am new file mode 100644 index 000000000..97eb75680 --- /dev/null +++ b/volk/lib/Makefile.am @@ -0,0 +1,361 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) $(LV_CXXFLAGS) + + +# We build 3 libraries and 1 executable here. Two libraries contain +# everything except the libcppunit QA code, and one contains only the +# libcppunit-based QA code. The C++ QA code is especially recommended +# when you have general purpose C or C++ code that may not get +# thoroughly exercised by building and running a GR block. The +# executable runs the QA code at "make check" time. +# +# N.B., If there's a SWIG generated shared library and associated +# python code, it will be contained in ../python, not here. (That +# code is conditionally built depending on the state of the +# --without-python configure option.) However, the .i should be here +# next to the .h that it's based on. 
+ + +# list of programs run by "make check" and "make distcheck" +TESTS = test_all + + +lib_LTLIBRARIES = \ + libvolk.la \ + libvolk_runtime.la \ + libvolk_qa.la + + +# ---------------------------------------------------------------- +# The main library +# ---------------------------------------------------------------- + +universal_runtime_CODE = \ + volk_runtime.c \ + volk_init.c \ + volk_rank_archs.c + +universal_CODE = \ + volk.c \ + volk_environment_init.c + +generic_CODE = \ + volk_cpu_generic.cc + +x86_CODE = \ + volk_cpu_x86.c + +x86_SUBCODE = \ + cpuid_x86.S + +x86_64_SUBCODE = \ + cpuid_x86_64.S + +powerpc_CODE = \ + volk_cpu_powerpc.cc + + +if MD_CPU_generic +libvolk_la_SOURCES = \ + $(generic_CODE) \ + $(universal_CODE) +libvolk_runtime_la_SOURCES = \ + $(generic_CODE) \ + $(universal_runtime_CODE) + +endif + +if MD_CPU_x86 +if MD_SUBCPU_x86_64 +libvolk_la_SOURCES = \ + $(x86_CODE) \ + $(x86_64_SUBCODE) \ + $(universal_CODE) + +libvolk_runtime_la_SOURCES = \ + $(x86_CODE) \ + $(x86_64_SUBCODE) \ + $(universal_runtime_CODE) +else +libvolk_la_SOURCES = \ + $(x86_CODE) \ + $(x86_SUBCODE) \ + $(universal_CODE) + +libvolk_runtime_la_SOURCES = \ + $(x86_CODE) \ + $(x86_SUBCODE) \ + $(universal_runtime_CODE) +endif +endif + + +if MD_CPU_powerpc +libvolk_la_SOURCES = \ + $(powerpc_CODE) \ + $(universal_CODE) + +libvolk_runtime_la_SOURCES = \ + $(powerpc_CODE) \ + $(universal_runtime_CODE) +endif + + + +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 + +libvolk_la_LIBADD = + + + +# ---------------------------------------------------------------- +# The QA library. 
Note libvolk.la in LIBADD +# ---------------------------------------------------------------- +libvolk_qa_la_SOURCES = \ + qa_volk.cc \ + qa_16s_quad_max_star_aligned16.cc \ + qa_32fc_dot_prod_aligned16.cc \ + qa_32fc_square_dist_aligned16.cc \ + qa_32fc_square_dist_scalar_mult_aligned16.cc \ + qa_32f_sum_of_poly_aligned16.cc \ + qa_32fc_index_max_aligned16.cc \ + qa_32f_index_max_aligned16.cc \ + qa_32fc_conjugate_dot_prod_aligned16.cc \ + qa_16s_permute_and_scalar_add_aligned16.cc \ + qa_16s_branch_4_state_8_aligned16.cc \ + qa_16s_max_star_horizontal_aligned16.cc \ + qa_16s_max_star_aligned16.cc \ + qa_16s_add_quad_aligned16.cc \ + qa_32f_add_aligned16.cc \ + qa_32f_subtract_aligned16.cc \ + qa_32f_max_aligned16.cc \ + qa_32f_min_aligned16.cc \ + qa_64f_max_aligned16.cc \ + qa_64f_min_aligned16.cc \ + qa_32s_and_aligned16.cc \ + qa_32s_or_aligned16.cc \ + qa_32f_dot_prod_aligned16.cc \ + qa_32f_dot_prod_unaligned16.cc \ + qa_32f_fm_detect_aligned16.cc \ + qa_32fc_32f_multiply_aligned16.cc \ + qa_32fc_multiply_aligned16.cc \ + qa_32f_divide_aligned16.cc \ + qa_32f_multiply_aligned16.cc \ + qa_32f_sqrt_aligned16.cc \ + qa_8sc_multiply_conjugate_16sc_aligned16.cc \ + qa_8sc_multiply_conjugate_32fc_aligned16.cc \ + qa_32u_popcnt_aligned16.cc \ + qa_64u_popcnt_aligned16.cc \ + qa_64u_byteswap_aligned16.cc \ + qa_8sc_deinterleave_32f_aligned16.cc \ + qa_16sc_deinterleave_32f_aligned16.cc \ + qa_8sc_deinterleave_16s_aligned16.cc \ + qa_32f_interleave_32fc_aligned16.cc \ + qa_16u_byteswap_aligned16.cc \ + qa_16sc_deinterleave_16s_aligned16.cc \ + qa_32fc_deinterleave_real_32f_aligned16.cc \ + qa_32fc_magnitude_32f_aligned16.cc \ + qa_32fc_deinterleave_real_64f_aligned16.cc \ + qa_32fc_deinterleave_real_16s_aligned16.cc \ + qa_32fc_magnitude_16s_aligned16.cc \ + qa_32fc_deinterleave_32f_aligned16.cc \ + qa_8sc_deinterleave_real_8s_aligned16.cc \ + qa_32fc_deinterleave_64f_aligned16.cc \ + qa_32f_interleave_16sc_aligned16.cc \ + qa_16sc_deinterleave_real_8s_aligned16.cc \ 
+ qa_16sc_deinterleave_real_32f_aligned16.cc \ + qa_16sc_magnitude_32f_aligned16.cc \ + qa_32u_byteswap_aligned16.cc \ + qa_16sc_deinterleave_real_16s_aligned16.cc \ + qa_8sc_deinterleave_real_32f_aligned16.cc \ + qa_16sc_magnitude_16s_aligned16.cc \ + qa_32f_normalize_aligned16.cc \ + qa_8sc_deinterleave_real_16s_aligned16.cc \ + qa_16s_convert_32f_aligned16.cc \ + qa_16s_convert_32f_unaligned16.cc \ + qa_16s_convert_8s_aligned16.cc \ + qa_16s_convert_8s_unaligned16.cc \ + qa_32f_convert_16s_aligned16.cc \ + qa_32f_convert_16s_unaligned16.cc \ + qa_32f_convert_32s_aligned16.cc \ + qa_32f_convert_32s_unaligned16.cc \ + qa_32f_convert_64f_aligned16.cc \ + qa_32f_convert_64f_unaligned16.cc \ + qa_32f_convert_8s_aligned16.cc \ + qa_32f_convert_8s_unaligned16.cc \ + qa_32s_convert_32f_aligned16.cc \ + qa_32s_convert_32f_unaligned16.cc \ + qa_64f_convert_32f_aligned16.cc \ + qa_64f_convert_32f_unaligned16.cc \ + qa_8s_convert_16s_aligned16.cc \ + qa_8s_convert_16s_unaligned16.cc \ + qa_8s_convert_32f_aligned16.cc \ + qa_8s_convert_32f_unaligned16.cc \ + qa_32fc_32f_power_32fc_aligned16.cc \ + qa_32f_power_aligned16.cc \ + qa_32fc_atan2_32f_aligned16.cc \ + qa_32fc_power_spectral_density_32f_aligned16.cc \ + qa_32fc_power_spectrum_32f_aligned16.cc \ + qa_32f_calc_spectral_noise_floor_aligned16.cc \ + qa_32f_accumulator_aligned16.cc \ + qa_32f_stddev_aligned16.cc \ + qa_32f_stddev_and_mean_aligned16.cc + +libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 + +libvolk_qa_la_LIBADD = \ + libvolk.la \ + libvolk_runtime.la \ + $(CPPUNIT_LIBS) + +# ---------------------------------------------------------------- +# headers that don't get installed +# ---------------------------------------------------------------- +noinst_HEADERS = \ + volk_init.h \ + qa_volk.h \ + qa_16s_quad_max_star_aligned16.h \ + qa_32fc_dot_prod_aligned16.h \ + qa_32fc_square_dist_aligned16.h \ + qa_32fc_square_dist_scalar_mult_aligned16.h \ + qa_32f_sum_of_poly_aligned16.h \ + 
qa_32fc_index_max_aligned16.h \ + qa_32f_index_max_aligned16.h \ + qa_32fc_conjugate_dot_prod_aligned16.h \ + qa_16s_permute_and_scalar_add_aligned16.h \ + qa_16s_branch_4_state_8_aligned16.h \ + qa_16s_max_star_horizontal_aligned16.h \ + qa_16s_max_star_aligned16.h \ + qa_16s_add_quad_aligned16.h \ + qa_32f_add_aligned16.h \ + qa_32f_subtract_aligned16.h \ + qa_32f_max_aligned16.h \ + qa_32f_min_aligned16.h \ + qa_64f_max_aligned16.h \ + qa_64f_min_aligned16.h \ + qa_32s_and_aligned16.h \ + qa_32s_or_aligned16.h \ + qa_32f_dot_prod_aligned16.h \ + qa_32f_dot_prod_unaligned16.h \ + qa_32f_fm_detect_aligned16.h \ + qa_32fc_32f_multiply_aligned16.h \ + qa_32fc_multiply_aligned16.h \ + qa_32f_divide_aligned16.h \ + qa_32f_multiply_aligned16.h \ + qa_32f_sqrt_aligned16.h \ + qa_8sc_multiply_conjugate_16sc_aligned16.h \ + qa_8sc_multiply_conjugate_32fc_aligned16.h \ + qa_32u_popcnt_aligned16.h \ + qa_64u_popcnt_aligned16.h \ + qa_64u_byteswap_aligned16.h \ + qa_8sc_deinterleave_32f_aligned16.h \ + qa_16sc_deinterleave_32f_aligned16.h \ + qa_8sc_deinterleave_16s_aligned16.h \ + qa_32f_interleave_32fc_aligned16.h \ + qa_16u_byteswap_aligned16.h \ + qa_16sc_deinterleave_16s_aligned16.h \ + qa_32fc_deinterleave_real_32f_aligned16.h \ + qa_32fc_magnitude_32f_aligned16.h \ + qa_32fc_deinterleave_real_64f_aligned16.h \ + qa_32fc_deinterleave_real_16s_aligned16.h \ + qa_32fc_magnitude_16s_aligned16.h \ + qa_32fc_deinterleave_32f_aligned16.h \ + qa_8sc_deinterleave_real_8s_aligned16.h \ + qa_32fc_deinterleave_64f_aligned16.h \ + qa_32f_interleave_16sc_aligned16.h \ + qa_16sc_deinterleave_real_8s_aligned16.h \ + qa_16sc_deinterleave_real_32f_aligned16.h \ + qa_16sc_magnitude_32f_aligned16.h \ + qa_32u_byteswap_aligned16.h \ + qa_16sc_deinterleave_real_16s_aligned16.h \ + qa_8sc_deinterleave_real_32f_aligned16.h \ + qa_16sc_magnitude_16s_aligned16.h \ + qa_32f_normalize_aligned16.h \ + qa_8sc_deinterleave_real_16s_aligned16.h \ + qa_16s_convert_32f_aligned16.h \ + 
qa_16s_convert_32f_unaligned16.h \ + qa_16s_convert_8s_aligned16.h \ + qa_16s_convert_8s_unaligned16.h \ + qa_32f_convert_16s_aligned16.h \ + qa_32f_convert_16s_unaligned16.h \ + qa_32f_convert_32s_aligned16.h \ + qa_32f_convert_32s_unaligned16.h \ + qa_32f_convert_64f_aligned16.h \ + qa_32f_convert_64f_unaligned16.h \ + qa_32f_convert_8s_aligned16.h \ + qa_32f_convert_8s_unaligned16.h \ + qa_32s_convert_32f_aligned16.h \ + qa_32s_convert_32f_unaligned16.h \ + qa_64f_convert_32f_aligned16.h \ + qa_64f_convert_32f_unaligned16.h \ + qa_8s_convert_16s_aligned16.h \ + qa_8s_convert_16s_unaligned16.h \ + qa_8s_convert_32f_aligned16.h \ + qa_8s_convert_32f_unaligned16.h \ + qa_32fc_32f_power_32fc_aligned16.h \ + qa_32f_power_aligned16.h \ + qa_32fc_atan2_32f_aligned16.h \ + qa_32fc_power_spectral_density_32f_aligned16.h \ + qa_32fc_power_spectrum_32f_aligned16.h \ + qa_32f_calc_spectral_noise_floor_aligned16.h \ + qa_32f_accumulator_aligned16.h \ + qa_32f_stddev_aligned16.h \ + qa_32f_stddev_and_mean_aligned16.h + + +# ---------------------------------------------------------------- +# Our test program +# ---------------------------------------------------------------- +noinst_PROGRAMS = \ + test_all + +test_all_SOURCES = test_all.cc +test_all_LDADD = libvolk_qa.la + + +distclean-local: + rm -f volk.c + rm -f volk_cpu_generic.c + rm -f volk_cpu_powerpc.c + rm -f volk_cpu_x86.c + rm -f volk_init.c + rm -f volk_init.h + rm -f volk_mktables + rm -f volk_mktables.c + rm -f volk_proccpu_sim.c + rm -f volk_runtime.c + rm -f volk_tables.h + rm -f volk_environment_init.c +#SUBDIRS = + +#ifdef BUILD_SSE +#SUBDIRS += sse +#elif BUILD_SPU +#SUBDIRS += spu +#else +#SUBDIRS += port +#endif + + diff --git a/volk/lib/assembly.h b/volk/lib/assembly.h new file mode 100644 index 000000000..8a99aa07c --- /dev/null +++ b/volk/lib/assembly.h @@ -0,0 +1,67 @@ +/* -*- c++ -*- */ +/* + * Copyright 2002 Free Software Foundation, Inc. 
+ * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef _ASSEMBLY_H_ +#define _ASSEMBLY_H_ + +#if defined (__APPLE__) && defined (__APPLE_CC__) + +// XCode ignores the .scl and .type functions in XCode 2.2.1 and 2.3, +// but creates an error in XCode 2.4. Just ignore them. + +#define GLOB_SYMB(f) _ ## f + +#define DEF_FUNC_HEAD(f) /* none */ + +#define FUNC_TAIL(f) /* none*/ + +#elif !defined (__ELF__) + +/* + * Too bad, the following define does not work as expected --SF + * #define GLOB_SYMB(f) __USER_LABEL_PREFIX__ ## f + */ +#define GLOB_SYMB(f) _ ## f + +#define DEF_FUNC_HEAD(f) \ + .def GLOB_SYMB(f); .scl 2; .type 32; .endef + +#define FUNC_TAIL(f) /* none */ + + +#else /* !__ELF__ */ + + +#define GLOB_SYMB(f) f + +#define DEF_FUNC_HEAD(f) \ + .type GLOB_SYMB(f),@function \ + +#define FUNC_TAIL(f) \ + .Lfe1: \ + .size GLOB_SYMB(f),.Lfe1-GLOB_SYMB(f) + + +#endif /* !__ELF__ */ + + +#endif /* _ASSEMBLY_H_ */ diff --git a/volk/lib/cpuid_x86.S b/volk/lib/cpuid_x86.S new file mode 100644 index 000000000..4e1a9404f --- /dev/null +++ b/volk/lib/cpuid_x86.S @@ -0,0 +1,60 @@ +# +# Copyright 2003 Free Software Foundation, Inc. 
+# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + +# +# execute CPUID instruction, return EAX, EBX, ECX and EDX values in result +# +# void cpuid_x86 (unsigned int op, unsigned int result[4]); +# + +#include "assembly.h" + +.file "cpuid_x86.S" + .version "01.01" +.text +.globl GLOB_SYMB(cpuid_x86) + DEF_FUNC_HEAD(cpuid_x86) +GLOB_SYMB(cpuid_x86): + pushl %ebp + movl %esp, %ebp + pushl %ebx # must save in PIC mode, holds GOT pointer + pushl %esi + + movl 8(%ebp), %eax # op + movl 12(%ebp), %esi # result + cpuid + movl %eax, 0(%esi) + movl %ebx, 4(%esi) + movl %ecx, 8(%esi) + movl %edx, 12(%esi) + + popl %esi + popl %ebx + popl %ebp + ret + +FUNC_TAIL(cpuid_x86) + .ident "Hand coded cpuid assembly" + + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/volk/lib/cpuid_x86_64.S b/volk/lib/cpuid_x86_64.S new file mode 100644 index 000000000..32b1847cd --- /dev/null +++ b/volk/lib/cpuid_x86_64.S @@ -0,0 +1,54 @@ +# +# Copyright 2003,2005 Free Software Foundation, Inc. 
+# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + +# +# execute CPUID instruction, return EAX, EBX, ECX and EDX values in result +# +# void cpuid_x86 (unsigned int op, unsigned int result[4]); +# + +#include "assembly.h" + +.file "cpuid_x86_64.S" + .version "01.01" +.text +.globl GLOB_SYMB(cpuid_x86) + DEF_FUNC_HEAD(cpuid_x86) +GLOB_SYMB(cpuid_x86): + mov %rbx, %r11 # must save in PIC mode, holds GOT pointer + + mov %rdi, %rax # op + cpuid + movl %eax, 0(%rsi) # result + movl %ebx, 4(%rsi) + movl %ecx, 8(%rsi) + movl %edx, 12(%rsi) + + mov %r11, %rbx + retq + +FUNC_TAIL(cpuid_x86) + .ident "Hand coded cpuid64 assembly" + + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc new file mode 100644 index 000000000..c3005c1be --- /dev/null +++ b/volk/lib/qa_16s_add_quad_aligned16.cc @@ -0,0 +1,89 @@ +#include <volk/volk.h> +#include <qa_16s_add_quad_aligned16.h> +#include <volk/volk_16s_add_quad_aligned16.h> +#include <cstdlib> +#include <time.h> +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_add_quad_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + + + +void qa_16s_add_quad_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3200; + const int ITERS = 100000; + short input0[vlen] __attribute__ ((aligned (16))); + short input1[vlen] __attribute__ ((aligned (16))); + short input2[vlen] __attribute__ ((aligned (16))); + short input3[vlen] __attribute__ ((aligned (16))); + short input4[vlen] __attribute__ ((aligned (16))); + + short output0[vlen] __attribute__ ((aligned (16))); + short output1[vlen] __attribute__ ((aligned (16))); + short output2[vlen] __attribute__ ((aligned (16))); + short output3[vlen] __attribute__ ((aligned (16))); + short output01[vlen] __attribute__ ((aligned (16))); + short output11[vlen] __attribute__ ((aligned (16))); + short output21[vlen] __attribute__ ((aligned (16))); + short output31[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus1 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus1 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus2 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus2 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; + + input0[i] = plus0 - minus0; + input1[i] = plus1 - minus1; + input2[i] = plus2 - minus2; + input3[i] = plus3 - minus3; + input4[i] = plus4 - minus4; + + } + printf("16s_add_quad_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_add_quad_aligned16_manual(output0, output1, output2, output3, input0, input1, input2, input3, input4, vlen << 1 , "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + 
printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_add_quad_aligned16_manual(output01, output11, output21, output31, input0, input1, input2, input3, input4, vlen << 1 , "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output1[i], output11[i]); + CPPUNIT_ASSERT_EQUAL(output2[i], output21[i]); + CPPUNIT_ASSERT_EQUAL(output3[i], output31[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_add_quad_aligned16.h b/volk/lib/qa_16s_add_quad_aligned16.h new file mode 100644 index 000000000..3c1ae978b --- /dev/null +++ b/volk/lib/qa_16s_add_quad_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H +#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_add_quad_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc new file mode 100644 index 000000000..ba5e8ed93 --- /dev/null +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc @@ -0,0 +1,106 @@ +#include <volk/volk.h> +#include <qa_16s_branch_4_state_8_aligned16.h> +#include <cstdlib> +#include <time.h> + +//test for ssse3 + +#ifndef LV_HAVE_SSSE3 + +void qa_16s_branch_4_state_8_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + +void qa_16s_branch_4_state_8_aligned16::t1() { + const int num_iters = 1000000; + const int vlen = 32; + + static char permute0[16]__attribute__((aligned(16))) = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03}; + static char permute1[16]__attribute__((aligned(16))) = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01}; + static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f}; + static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d}; + static char* permuters[4] = {permute0, permute1, permute2, permute3}; + + unsigned int num_bytes = vlen << 1; + + volk_environment_init(); + clock_t start, end; + double total; + + short target[vlen] __attribute__ ((aligned (16))); + short target2[vlen] __attribute__ ((aligned (16))); + short target3[vlen] __attribute__ ((aligned (16))); + + short src0[vlen] __attribute__ ((aligned (16))); + short permute_indexes[vlen] __attribute__ ((aligned (16))) = { +7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 }; + short cntl0[vlen] __attribute__ ((aligned (16))) = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + short cntl1[vlen] __attribute__ ((aligned (16))) = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + short cntl2[vlen] __attribute__ ((aligned 
(16))) = { + 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 }; + short cntl3[vlen] __attribute__ ((aligned (16))) = { + 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff }; + short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4}; + + + + for(int i = 0; i < vlen; ++i) { + src0[i] = i; + + } + + + printf("16s_branch_4_state_8_aligned\n"); + + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("permute_and_scalar_add_time: %f\n", total); + + + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("branch_4_state_8_time, ssse3: %f\n", total); + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("branch_4_state_8_time, generic: %f\n", total); + + + + for(int i = 0; i < vlen; ++i) { + printf("psa... %d, b4s8... 
%d\n", target[i], target3[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT(target[i] == target2[i]); + CPPUNIT_ASSERT(target[i] == target3[i]); + } +} + + +#endif diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.h b/volk/lib/qa_16s_branch_4_state_8_aligned16.h new file mode 100644 index 000000000..41ab073e0 --- /dev/null +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H +#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_32f_aligned16.cc b/volk/lib/qa_16s_convert_32f_aligned16.cc new file mode 100644 index 000000000..7878d4737 --- /dev/null +++ b/volk/lib/qa_16s_convert_32f_aligned16.cc @@ -0,0 +1,73 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_16s_convert_32f_aligned16.h> +#include <volk/volk_16s_convert_32f_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_16s_convert_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_16s_convert_32f_aligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int16_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0)); + } + printf("16s_convert_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_16s_convert_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_convert_32f_aligned16.h b/volk/lib/qa_16s_convert_32f_aligned16.h new file mode 100644 index 000000000..ef813d96f --- /dev/null +++ b/volk/lib/qa_16s_convert_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H +#define INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_convert_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_convert_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.cc b/volk/lib/qa_16s_convert_32f_unaligned16.cc new file mode 100644 index 000000000..8c3121e5c --- /dev/null +++ b/volk/lib/qa_16s_convert_32f_unaligned16.cc @@ -0,0 +1,73 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_16s_convert_32f_unaligned16.h> +#include <volk/volk_16s_convert_32f_unaligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_16s_convert_32f_unaligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_16s_convert_32f_unaligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int16_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0)); + } + printf("16s_convert_32f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_32f_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_32f_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_16s_convert_32f_unaligned16(output_sse4_1, input0, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.h b/volk/lib/qa_16s_convert_32f_unaligned16.h new file mode 100644 index 000000000..aeb04f770 --- /dev/null +++ b/volk/lib/qa_16s_convert_32f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H +#define INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_convert_32f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_convert_32f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_8s_aligned16.cc b/volk/lib/qa_16s_convert_8s_aligned16.cc new file mode 100644 index 000000000..734b7784e --- /dev/null +++ b/volk/lib/qa_16s_convert_8s_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_16s_convert_8s_aligned16.h> +#include <volk/volk_16s_convert_8s_aligned16.h> +#include <cstdlib> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_convert_8s_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16s_convert_8s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int16_t input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0)); + } + printf("16s_convert_8s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_8s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_8s_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d -> %d...%d\n", input0[i], output_generic[i], output_sse2[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_convert_8s_aligned16.h b/volk/lib/qa_16s_convert_8s_aligned16.h new file mode 100644 index 000000000..2e409d0cc --- /dev/null +++ b/volk/lib/qa_16s_convert_8s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H +#define INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_convert_8s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_convert_8s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.cc b/volk/lib/qa_16s_convert_8s_unaligned16.cc new file mode 100644 index 000000000..275ab7668 --- /dev/null +++ b/volk/lib/qa_16s_convert_8s_unaligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_16s_convert_8s_unaligned16.h> +#include <volk/volk_16s_convert_8s_unaligned16.h> +#include <cstdlib> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_convert_8s_unaligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16s_convert_8s_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int16_t input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0)); + } + printf("16s_convert_8s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_8s_unaligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_8s_unaligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.h b/volk/lib/qa_16s_convert_8s_unaligned16.h new file mode 100644 index 000000000..4b2fe9e42 --- /dev/null +++ b/volk/lib/qa_16s_convert_8s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H +#define INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_convert_8s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_convert_8s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H */ diff --git a/volk/lib/qa_16s_max_star_aligned16.cc b/volk/lib/qa_16s_max_star_aligned16.cc new file mode 100644 index 000000000..b46b9ae8e --- /dev/null +++ b/volk/lib/qa_16s_max_star_aligned16.cc @@ -0,0 +1,65 @@ +#include <volk/volk.h> +#include <qa_16s_max_star_aligned16.h> +#include <volk/volk_16s_max_star_aligned16.h> +#include <cstdlib> +#include <time.h> +//test for ssse3 + +#ifndef LV_HAVE_SSSE3 + +void qa_16s_max_star_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + + + +void qa_16s_max_star_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 6400; + const int ITERS = 100000; + short input0[vlen] __attribute__ ((aligned (16))); + short output0[1] __attribute__ ((aligned (16))); + + short output1[1] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + + short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + + input0[i] = plus0 - minus0; + + } + printf("16s_max_star_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_max_star_aligned16_manual(output0, input0, vlen << 1, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_max_star_aligned16_manual(output1, input0, vlen << 1, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < 1; ++i) { + + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_max_star_aligned16.h b/volk/lib/qa_16s_max_star_aligned16.h new file mode 100644 index 000000000..119f87c4d --- /dev/null +++ b/volk/lib/qa_16s_max_star_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H +#define INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_max_star_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_max_star_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.cc b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc new file mode 100644 index 000000000..4d44735df --- /dev/null +++ b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc @@ -0,0 +1,79 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_16s_max_star_horizontal_aligned16.h> +#include <volk/volk_16s_max_star_horizontal_aligned16.h> +#include <cstdlib> +#include <time.h> +//test for ssse3 + +#ifndef LV_HAVE_SSSE3 + +void qa_16s_max_star_horizontal_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + + +void qa_16s_max_star_horizontal_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 32; + const int ITERS = 1; + short input0[vlen] __attribute__ ((aligned (16))); + short output0[vlen>>1] __attribute__ ((aligned (16))); + + short output1[vlen>>1] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = ((short) (rand() - (RAND_MAX/2))); + + short minus0 = ((short) (rand() - (RAND_MAX/2))); + + input0[i] = plus0 - minus0; + + } + printf("16s_max_star_horizontal_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_max_star_horizontal_aligned16_manual(output0, input0, 2*vlen, "generic"); + volk_16s_max_star_horizontal_aligned16_manual(output0, output0, vlen, "generic"); + volk_16s_max_star_horizontal_aligned16_manual(output0, output0, vlen/2, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + + get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, input0, 2*vlen); + get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, output1, vlen); + get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, output1, vlen); + /* volk_16s_max_star_horizontal_aligned16(output1, input0, 2*vlen, "ssse3"); + volk_16s_max_star_horizontal_aligned16(output1, output1, vlen, "ssse3"); + volk_16s_max_star_horizontal_aligned16(output1, output1, vlen, "ssse3");*/ + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < (vlen >> 1); ++i) { + // printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + + } + for(int i = 0; i < (vlen >> 1); ++i) { + + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); + } + } + + +#endif + diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.h b/volk/lib/qa_16s_max_star_horizontal_aligned16.h new file mode 100644 index 000000000..9f9757253 --- /dev/null +++ b/volk/lib/qa_16s_max_star_horizontal_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H +#define INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_max_star_horizontal_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_max_star_horizontal_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc new file mode 100644 index 000000000..3c4f5c6cc --- /dev/null +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc @@ -0,0 +1,78 @@ +#include <volk/volk.h> +#include <qa_16s_permute_and_scalar_add_aligned16.h> +#include <volk/volk_16s_permute_and_scalar_add_aligned16.h> +#include <cstdlib> +#include <time.h> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_permute_and_scalar_add_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16s_permute_and_scalar_add_aligned16::t1() { + const int vlen = 64; + + unsigned int num_bytes = vlen << 1; + + volk_environment_init(); + clock_t start, end; + double total; + + short target[vlen] __attribute__ ((aligned (16))); + short target2[vlen] __attribute__ ((aligned (16))); + short src0[vlen] __attribute__ ((aligned (16))); + short permute_indexes[vlen] __attribute__ ((aligned (16))); + short cntl0[vlen] __attribute__ ((aligned (16))); + short cntl1[vlen] __attribute__ ((aligned (16))); + short cntl2[vlen] __attribute__ ((aligned (16))); + short cntl3[vlen] __attribute__ ((aligned (16))); + short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4}; + + for(int i = 0; i < vlen; ++i) { + src0[i] = i; + permute_indexes[i] = (3 * i)%vlen; + cntl0[i] = 0xff; + cntl1[i] = 0xff * (i%2); + cntl2[i] = 0xff * ((i>>1)%2); + cntl3[i] = 0xff * ((i%4) == 3); + } + + printf("16s_permute_and_scalar_add_aligned\n"); + + start = clock(); + for(int i = 0; i < 100000; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("generic_time: %f\n", total); + + start = clock(); + for(int i = 0; i < 100000; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("sse2_time: %f\n", total); + + + for(int i = 0; i < vlen; ++i) { + //printf("generic... %d, sse2... 
%d\n", target[i], target2[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT(target[i] == target2[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h new file mode 100644 index 000000000..3643aeef6 --- /dev/null +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H +#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc new file mode 100644 index 000000000..80a220c93 --- /dev/null +++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc @@ -0,0 +1,59 @@ +#include <volk/volk.h> +#include <qa_16s_quad_max_star_aligned16.h> +#include <volk/volk_16s_quad_max_star_aligned16.h> +#include <cstdlib> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_quad_max_star_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16s_quad_max_star_aligned16::t1() { + const int vlen = 34; + + short input0[vlen] __attribute__ ((aligned (16))); + short input1[vlen] __attribute__ ((aligned (16))); + short input2[vlen] __attribute__ ((aligned (16))); + short input3[vlen] __attribute__ ((aligned (16))); + + short output0[vlen] __attribute__ ((aligned (16))); + short output1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = (short) (rand() - (RAND_MAX/2)); + short plus1 = (short) (rand() - (RAND_MAX/2)); + short plus2 = (short) (rand() - (RAND_MAX/2)); + short plus3 = (short) (rand() - (RAND_MAX/2)); + + short minus0 = (short) (rand() - (RAND_MAX/2)); + short minus1 = (short) (rand() - (RAND_MAX/2)); + short minus2 = (short) (rand() - (RAND_MAX/2)); + short minus3 = (short) (rand() - (RAND_MAX/2)); + + input0[i] = plus0 - minus0; + input1[i] = plus1 - minus1; + input2[i] = plus2 - minus2; + input3[i] = plus3 - minus3; + } + + volk_16s_quad_max_star_aligned16_manual(output0, input0, input1, input2, input3, 2*vlen, "generic"); + + volk_16s_quad_max_star_aligned16_manual(output1, input0, input1, input2, input3, 2*vlen, "sse2"); + + printf("16s_quad_max_star_aligned\n"); + for(int i = 0; i < vlen; ++i) { + printf("generic... %d, sse2... 
%d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.h b/volk/lib/qa_16s_quad_max_star_aligned16.h new file mode 100644 index 000000000..51e77081a --- /dev/null +++ b/volk/lib/qa_16s_quad_max_star_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H +#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc new file mode 100644 index 000000000..e700ac72c --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc @@ -0,0 +1,76 @@ +#include <volk/volk.h> +#include <qa_16sc_deinterleave_16s_aligned16.h> +#include <volk/volk_16sc_deinterleave_16s_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSSE3 + +void qa_16sc_deinterleave_16s_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_generic1[vlen] __attribute__ ((aligned (16))); + int16_t output_sse2[vlen] __attribute__ ((aligned (16))); + int16_t output_sse21[vlen] __attribute__ ((aligned (16))); + int16_t output_ssse3[vlen] __attribute__ ((aligned (16))); + int16_t output_ssse31[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] = ((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32678.0)); + } + printf("16sc_deinterleave_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_16s_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_16s_aligned16_manual(output_ssse3, output_ssse31, input0, vlen, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_sse21[i]); + + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); + CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_ssse31[i]); + } +} + +#endif diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.h b/volk/lib/qa_16sc_deinterleave_16s_aligned16.h new file mode 100644 index 000000000..995ab5b34 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16sc_deinterleave_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc new file mode 100644 index 000000000..6ee076998 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include <volk/volk.h> +#include <qa_16sc_deinterleave_32f_aligned16.h> +#include <volk/volk_16sc_deinterleave_32f_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_16sc_deinterleave_32f_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_generic1[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + float output_sse21[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0)); + } + printf("16sc_deinterleave_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_32f_aligned16_manual(output_sse2, output_sse21, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse21[i], fabs(output_generic1[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.h b/volk/lib/qa_16sc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..fea3b6c2d --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16sc_deinterleave_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc new file mode 100644 index 000000000..ca048ea67 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc @@ -0,0 +1,71 @@ +#include <volk/volk.h> +#include <qa_16sc_deinterleave_real_16s_aligned16.h> +#include <volk/volk_16sc_deinterleave_real_16s_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSSE3 + +void qa_16sc_deinterleave_real_16s_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_real_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse2[vlen] __attribute__ ((aligned (16))); + int16_t output_ssse3[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] = ((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32678.0)); + } + printf("16sc_deinterleave_real_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_16s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_16s_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_16s_aligned16_manual(output_ssse3, input0, vlen, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < vlen; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + // printf("%d = generic... %d, sse2... %d, ssse3... 
%d\n", i, output_generic[i], output_sse2[i], output_ssse3[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_ssse3[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..ebb70b97a --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16sc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc new file mode 100644 index 000000000..0f4ba6923 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc @@ -0,0 +1,123 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_16sc_deinterleave_real_32f_aligned16.h> +#include <volk/volk_16sc_deinterleave_real_32f_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +#ifndef LV_HAVE_SSE + +void qa_16sc_deinterleave_real_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_real_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0)); + } + printf("16sc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + } +} + +#endif /* SSE */ + +#else + +void qa_16sc_deinterleave_real_32f_aligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0); + } + printf("16sc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_16sc_deinterleave_real_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4); + } +} + +#endif /* SSE4_1 */ diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..e83426473 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_16sc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc new file mode 100644 index 000000000..5ab458bc9 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_16sc_deinterleave_real_8s_aligned16.h> +#include <volk/volk_16sc_deinterleave_real_8s_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSSE3 + +void qa_16sc_deinterleave_real_8s_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n");
+}
+
+#else
+// t1: fills input0 with random int16 samples, runs the "generic" and "ssse3" kernels ITERS times each (printing elapsed CPU time) and requires element-wise identical int8 outputs.
+void qa_16sc_deinterleave_real_8s_aligned16::t1() {
+  // NOTE(review): vlen is odd, so presumably the kernels' leftover-element path is exercised too -- TODO confirm.
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;   // complex samples per vector
+  const int ITERS = 100000;   // kernel calls per timed loop
+  std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+
+  int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int8_t output_ssse3[vlen] __attribute__ ((aligned (16)));
+  // Scale the [-1, 1] quotient BEFORE narrowing to int16_t: casting first truncated almost every sample to 0, which made the comparison below vacuous. 32767.0 keeps the result inside int16_t's range.
+  int16_t* loadInput = (int16_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {
+    loadInput[i] = (int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32767.0);
+  }
+  printf("16sc_deinterleave_real_8s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_8s_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("ssse3_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {   // leftover debug loop: body is commented out
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h
new file mode 100644
index 000000000..04e5511e5
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H
+#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_deinterleave_real_8s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_8s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
new file mode 100644
index 000000000..b14610757
--- /dev/null
+++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_16sc_magnitude_16s_aligned16.h>
+#include <volk/volk_16sc_magnitude_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE3
+
+void qa_16sc_magnitude_16s_aligned16::t1() {
+  printf("sse3 not available... 
no test performed\n");
+}
+
+#else
+// t1: fills input0 with random int16 samples, times the "generic", "sse" and "sse3" kernels (ITERS calls each) and requires the SIMD outputs to be within 1.1 of the generic output.
+void qa_16sc_magnitude_16s_aligned16::t1() {
+  // NOTE(review): vlen is odd, so presumably the kernels' leftover-element path is exercised too -- TODO confirm.
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;   // complex samples per vector
+  const int ITERS = 100000;   // kernel calls per timed loop
+  std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse3[vlen] __attribute__ ((aligned (16)));
+  // Scale by 32767.0, not 32768.0: the quotient can reach +1.0, and +32768.0 is outside int16_t's range (out-of-range float-to-integer conversion is undefined).
+  int16_t* loadInput = (int16_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {
+    loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32767.0));
+  }
+  printf("16sc_magnitude_16s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_16s_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_16s_aligned16_manual(output_sse, input0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_16s_aligned16_manual(output_sse3, input0, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {   // leftover debug loop: body is commented out
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.h b/volk/lib/qa_16sc_magnitude_16s_aligned16.h
new file mode 100644
index 000000000..4664b70f4
--- /dev/null
+++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H
+#define INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_magnitude_16s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16sc_magnitude_16s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
new file mode 100644
index 000000000..06dff2fd5
--- /dev/null
+++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_16sc_magnitude_32f_aligned16.h>
+#include <volk/volk_16sc_magnitude_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE3
+
+void qa_16sc_magnitude_32f_aligned16::t1() {
+  printf("sse3 not available... 
no test performed\n");
+}
+
+#else
+// t1: fills input0 with random int16 samples, times the "generic", "sse" and "sse3" kernels (ITERS calls each, scalar argument 32768.0) and requires the SIMD outputs to match the generic output to a relative 1e-4.
+void qa_16sc_magnitude_32f_aligned16::t1() {
+  // NOTE(review): vlen is odd, so presumably the kernels' leftover-element path is exercised too -- TODO confirm.
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;   // complex samples per vector
+  const int ITERS = 100000;   // kernel calls per timed loop
+  std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+  float output_sse3[vlen] __attribute__ ((aligned (16)));
+  // Scale the [-1, 1] quotient to the int16_t range before the cast: the unscaled cast truncated almost every sample to 0, so the kernels were only ever compared on zeros.
+  int16_t* inputLoad = (int16_t*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {
+    inputLoad[i] = (int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32767.0);
+  }
+  printf("16sc_magnitude_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_32f_aligned16_manual(output_sse3, input0, 32768.0, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {   // leftover debug loop: body is commented out
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.h b/volk/lib/qa_16sc_magnitude_32f_aligned16.h
new file mode 100644
index 000000000..0c25673ea
--- /dev/null
+++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H
+#define INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_magnitude_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16sc_magnitude_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc
new file mode 100644
index 000000000..6b19828a4
--- /dev/null
+++ b/volk/lib/qa_16u_byteswap_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16u_byteswap_aligned16.h>
+#include <volk/volk_16u_byteswap_aligned16.h>
+#include <cstdlib>
+#include <cstring>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16u_byteswap_aligned16::t1() {
+  printf("sse2 not available... 
no test performed\n");
+}
+
+#else
+// t1: fills two identical uint16 buffers with random words, runs the "generic" kernel ITERS times on one and the "sse2" kernel ITERS times on the other, then requires the buffers to be equal.
+void qa_16u_byteswap_aligned16::t1() {
+  // NOTE(review): the kernel takes a single buffer, so it presumably swaps in place; ITERS is odd, which would leave both buffers in the swapped state -- TODO confirm.
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;   // 16-bit words per buffer
+  const int ITERS = 100001;   // kernel calls per timed loop
+
+  uint16_t output0[vlen] __attribute__ ((aligned (16)));
+  uint16_t output01[vlen] __attribute__ ((aligned (16)));
+  // Use the low 16 bits of rand(): the old integer division produced only -1/0/1, i.e. 0xFFFF/0x0000 words that are unchanged by a byte swap, so the test could not fail.
+  for(int i = 0; i < vlen; ++i) {
+    output0[i] = (uint16_t) (rand() & 0xFFFF);
+  }
+  memcpy(output01, output0, vlen*sizeof(uint16_t));
+
+  printf("16u_byteswap_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16u_byteswap_aligned16_manual(output0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16u_byteswap_aligned16_manual(output01, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {   // leftover debug loop: body is commented out
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16u_byteswap_aligned16.h b/volk/lib/qa_16u_byteswap_aligned16.h
new file mode 100644
index 000000000..e11b23e3f
--- /dev/null
+++ b/volk/lib/qa_16u_byteswap_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H
+#define INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16u_byteswap_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16u_byteswap_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_accumulator_aligned16.cc b/volk/lib/qa_32f_accumulator_aligned16.cc
new file mode 100644
index 000000000..ea637d600
--- /dev/null
+++ b/volk/lib/qa_32f_accumulator_aligned16.cc
@@ -0,0 +1,56 @@
+#include <volk/volk.h>
+#include <qa_32f_accumulator_aligned16.h>
+#include <volk/volk_32f_accumulator_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_accumulator_aligned16::t1() {
+  printf("sse not available... 
no test performed\n");
+}
+
+#else
+// t1: fills input0 with random floats, times the "generic" and "sse" accumulator kernels (ITERS calls each) and requires the two results to agree to a relative 1e-4.
+void qa_32f_accumulator_aligned16::t1() {
+  // NOTE(review): the accumulators are passed uninitialised; this assumes each kernel call overwrites its output -- TODO confirm.
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;   // input elements
+  const int ITERS = 100000;   // kernel calls per timed loop
+  float input0[vlen] __attribute__ ((aligned (16)));
+
+  float accumulator_generic;
+  float accumulator_sse;
+  // Each sample is (rand() - RAND_MAX/2) / (RAND_MAX/2), computed in float.
+  for(int i = 0; i < vlen; ++i) {
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_accumulator_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_accumulator_aligned16_manual(&accumulator_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_accumulator_aligned16_manual(&accumulator_sse, input0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {   // leftover debug loop: body is commented out
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+
+  //printf("%d...%d\n", output0[i], output01[i]);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(accumulator_generic, accumulator_sse, fabs(accumulator_generic)*1e-4);
+}
+
+#endif
diff --git a/volk/lib/qa_32f_accumulator_aligned16.h b/volk/lib/qa_32f_accumulator_aligned16.h
new file mode 100644
index 000000000..0004d3ff0
--- /dev/null
+++ b/volk/lib/qa_32f_accumulator_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H
+#define INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_accumulator_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_accumulator_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc
new file mode 100644
index 000000000..92f35c7ec
--- /dev/null
+++ b/volk/lib/qa_32f_add_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_add_aligned16.h>
+#include <volk/volk_32f_add_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_add_aligned16::t1() {
+  printf("sse not available... 
no test performed\n");
+}
+
+#else
+// t1: fills input0/input1 with random floats, times the "generic" and "sse" add kernels (ITERS calls each) and requires exactly equal outputs.
+void qa_32f_add_aligned16::t1() {
+  // All buffers below are declared 16-byte aligned via __attribute__((aligned(16))).
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;   // elements per vector
+  const int ITERS = 100000;   // kernel calls per timed loop
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+
+  float output0[vlen] __attribute__ ((aligned (16)));    // "generic" result
+  float output01[vlen] __attribute__ ((aligned (16)));   // "sse" result
+  // Each sample is (rand() - RAND_MAX/2) / (RAND_MAX/2), computed in float.
+  for(int i = 0; i < vlen; ++i) {
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+    input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_add_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_add_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_add_aligned16_manual(output01, input0, input1, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {   // leftover debug loop: body is commented out
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_add_aligned16.h b/volk/lib/qa_32f_add_aligned16.h
new file mode 100644
index 000000000..58e2a151c
--- /dev/null
+++ b/volk/lib/qa_32f_add_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_ADD_ALIGNED16_H
+#define INCLUDED_QA_32F_ADD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_add_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_add_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_ADD_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc
new file mode 100644
index 000000000..3c8137004
--- /dev/null
+++ b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc
@@ -0,0 +1,59 @@
+#include <volk/volk.h>
+#include <qa_32f_calc_spectral_noise_floor_aligned16.h>
+#include <volk/volk_32f_calc_spectral_noise_floor_aligned16.h>
+#include <cstdlib>
+#include <math.h>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_calc_spectral_noise_floor_aligned16::t1() {
+  printf("sse not available... 
no test performed\n");
+}
+
+#else
+// t1: fills input0 with random floats, times the "generic" and "sse" noise-floor kernels (ITERS calls each, third argument 20) and requires the single-element results to agree to a relative 1e-4.
+void qa_32f_calc_spectral_noise_floor_aligned16::t1() {
+  // All buffers below are declared 16-byte aligned via __attribute__((aligned(16))).
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;   // input elements
+  const int ITERS = 100000;   // kernel calls per timed loop
+  float input0[vlen] __attribute__ ((aligned (16)));
+
+  float output0[1] __attribute__ ((aligned (16)));    // "generic" result
+  float output01[1] __attribute__ ((aligned (16)));   // "sse" result
+  // Each sample is (rand() - RAND_MAX/2) / (RAND_MAX/2), computed in float.
+  for(int i = 0; i < vlen; ++i) {
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_calc_spectral_noise_floor_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_calc_spectral_noise_floor_aligned16_manual(output0, input0, 20, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_calc_spectral_noise_floor_aligned16_manual(output01, input0, 20, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {   // leftover debug loop: body is commented out
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h
new file mode 100644
index 000000000..c5dce2c4b
--- /dev/null
+++ b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
+#define INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_calc_spectral_noise_floor_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_calc_spectral_noise_floor_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_16s_aligned16.cc b/volk/lib/qa_32f_convert_16s_aligned16.cc
new file mode 100644
index 000000000..84a4c40c4
--- /dev/null
+++ b/volk/lib/qa_32f_convert_16s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_16s_aligned16.h>
+#include <volk/volk_32f_convert_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_16s_aligned16::t1() {
+  printf("sse2 not available... 
no test performed\n");
+}
+
+#else
+// t1: fills input0 with random floats, times the "generic", "sse" and "sse2" float-to-int16 kernels (ITERS calls each, scalar argument 32768.0) and requires each SIMD output to differ from the generic one by at most 1.
+void qa_32f_convert_16s_aligned16::t1() {
+  // All buffers below are declared 16-byte aligned via __attribute__((aligned(16))).
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;   // elements per vector
+  const int ITERS = 100000;   // kernel calls per timed loop
+  float input0[vlen] __attribute__ ((aligned (16)));
+
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse2[vlen] __attribute__ ((aligned (16)));
+  // Each sample is (rand() - RAND_MAX/2) / (RAND_MAX/2), computed in float.
+  for(int i = 0; i < vlen; ++i) {
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_16s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_16s_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < vlen; ++i) {   // leftover debug loop: body is commented out
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("%d generic... %d, sse... %d sse2... %d\n", i, output_generic[i], output_sse[i], output_sse2[i]);
+  }
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_16s_aligned16.h b/volk/lib/qa_32f_convert_16s_aligned16.h
new file mode 100644
index 000000000..fce1eb417
--- /dev/null
+++ b/volk/lib/qa_32f_convert_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_16s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_16s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.cc b/volk/lib/qa_32f_convert_16s_unaligned16.cc
new file mode 100644
index 000000000..9469daed2
--- /dev/null
+++ b/volk/lib/qa_32f_convert_16s_unaligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_16s_unaligned16.h>
+#include <volk/volk_32f_convert_16s_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_16s_unaligned16::t1() {
+  printf("sse2 not available... 
no test performed\n");
+}
+
+#else
+// t1: fills input0 with random floats, times the "generic", "sse" and "sse2" float-to-int16 kernels (ITERS calls each, scalar argument 32768.0) and requires each SIMD output to differ from the generic one by at most 1.
+void qa_32f_convert_16s_unaligned16::t1() {
+  // NOTE(review): every buffer here is declared 16-byte aligned, so this test never passes the "unaligned" kernel a misaligned pointer -- confirm whether that is intended.
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;   // elements per vector
+  const int ITERS = 100000;   // kernel calls per timed loop
+  float input0[vlen] __attribute__ ((aligned (16)));
+
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse2[vlen] __attribute__ ((aligned (16)));
+  // Each sample is (rand() - RAND_MAX/2) / (RAND_MAX/2), computed in float.
+  for(int i = 0; i < vlen; ++i) {
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_16s_unaligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_16s_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_16s_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_16s_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {   // leftover debug loop: body is commented out
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.h b/volk/lib/qa_32f_convert_16s_unaligned16.h
new file mode 100644
index 000000000..492bc80e6
--- /dev/null
+++ b/volk/lib/qa_32f_convert_16s_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_16s_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_16s_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_32s_aligned16.cc b/volk/lib/qa_32f_convert_32s_aligned16.cc
new file mode 100644
index 000000000..ff24c7b0d
--- /dev/null
+++ b/volk/lib/qa_32f_convert_32s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_32s_aligned16.h>
+#include <volk/volk_32f_convert_32s_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_32s_aligned16::t1() {
+  printf("sse2 not available... 
no test performed\n");
+}
+
+#else
+// t1: fills input0 with random floats, times the "generic", "sse" and "sse2" float-to-int32 kernels (ITERS calls each, scalar argument 32768.0) and requires each SIMD output to differ from the generic one by at most 1.
+void qa_32f_convert_32s_aligned16::t1() {
+  // All buffers below are declared 16-byte aligned via __attribute__((aligned(16))).
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;   // elements per vector
+  const int ITERS = 100000;   // kernel calls per timed loop
+  float input0[vlen] __attribute__ ((aligned (16)));
+
+  int32_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int32_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int32_t output_sse2[vlen] __attribute__ ((aligned (16)));
+  // Each sample is (rand() - RAND_MAX/2) / (RAND_MAX/2), computed in float.
+  for(int i = 0; i < vlen; ++i) {
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_32s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_32s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_32s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_32s_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {   // leftover debug loop: body is commented out
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_32s_aligned16.h b/volk/lib/qa_32f_convert_32s_aligned16.h
new file mode 100644
index 000000000..97d854463
--- /dev/null
+++ b/volk/lib/qa_32f_convert_32s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_32s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_32s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.cc b/volk/lib/qa_32f_convert_32s_unaligned16.cc
new file mode 100644
index 000000000..e63b17994
--- /dev/null
+++ b/volk/lib/qa_32f_convert_32s_unaligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_32s_unaligned16.h>
+#include <volk/volk_32f_convert_32s_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_32s_unaligned16::t1() {
+  printf("sse2 not available... 
no test performed\n");
+}
+
+#else
+// t1: fills input0 with random floats, times the "generic", "sse" and "sse2" float-to-int32 kernels (ITERS calls each, scalar argument 32768.0) and requires each SIMD output to differ from the generic one by at most 1.
+void qa_32f_convert_32s_unaligned16::t1() {
+  // NOTE(review): every buffer here is declared 16-byte aligned, so this test never passes the "unaligned" kernel a misaligned pointer -- confirm whether that is intended.
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;   // elements per vector
+  const int ITERS = 100000;   // kernel calls per timed loop
+  float input0[vlen] __attribute__ ((aligned (16)));
+
+  int32_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int32_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int32_t output_sse2[vlen] __attribute__ ((aligned (16)));
+  // Each sample is (rand() - RAND_MAX/2) / (RAND_MAX/2), computed in float.
+  for(int i = 0; i < vlen; ++i) {
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_32s_unaligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_32s_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_32s_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_32s_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {   // leftover debug loop: body is commented out
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.h b/volk/lib/qa_32f_convert_32s_unaligned16.h
new file mode 100644
index 000000000..5d662d86d
--- /dev/null
+++ b/volk/lib/qa_32f_convert_32s_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_32s_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_32s_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_64f_aligned16.cc b/volk/lib/qa_32f_convert_64f_aligned16.cc
new file mode 100644
index 000000000..c546e47de
--- /dev/null
+++ b/volk/lib/qa_32f_convert_64f_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_64f_aligned16.h>
+#include <volk/volk_32f_convert_64f_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_64f_aligned16::t1() {
+  printf("sse2 not available... 
no test performed\n");
+}
+
+#else
+// t1: fills input0 with random floats, times the "generic" and "sse2" float-to-double kernels (ITERS calls each) and requires the outputs to agree to a relative 1e-6.
+void qa_32f_convert_64f_aligned16::t1() {
+  // All buffers below are declared 16-byte aligned via __attribute__((aligned(16))).
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;   // elements per vector
+  const int ITERS = 100000;   // kernel calls per timed loop
+  float input0[vlen] __attribute__ ((aligned (16)));
+
+  double output_generic[vlen] __attribute__ ((aligned (16)));
+  double output_sse2[vlen] __attribute__ ((aligned (16)));
+  // Each sample is (rand() - RAND_MAX/2) / (RAND_MAX/2), computed in float.
+  for(int i = 0; i < vlen; ++i) {
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_64f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_64f_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_64f_aligned16_manual(output_sse2, input0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {   // leftover debug loop: body is commented out
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i] ,output_sse2[i], fabs(output_generic[i])*1e-6);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_64f_aligned16.h b/volk/lib/qa_32f_convert_64f_aligned16.h
new file mode 100644
index 000000000..41eb3e094
--- /dev/null
+++ b/volk/lib/qa_32f_convert_64f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_64f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_64f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.cc b/volk/lib/qa_32f_convert_64f_unaligned16.cc
new file mode 100644
index 000000000..24b51f9af
--- /dev/null
+++ b/volk/lib/qa_32f_convert_64f_unaligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_64f_unaligned16.h>
+#include <volk/volk_32f_convert_64f_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_64f_unaligned16::t1() {
+  printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32f_convert_64f_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + double output_generic[vlen] __attribute__ ((aligned (16))); + double output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_convert_64f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_64f_unaligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_64f_unaligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.h b/volk/lib/qa_32f_convert_64f_unaligned16.h new file mode 100644 index 000000000..4b144f033 --- /dev/null +++ b/volk/lib/qa_32f_convert_64f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_convert_64f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_64f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_8s_aligned16.cc b/volk/lib/qa_32f_convert_8s_aligned16.cc new file mode 100644 index 000000000..a3d4d6567 --- /dev/null +++ b/volk/lib/qa_32f_convert_8s_aligned16.cc @@ -0,0 +1,70 @@ +#include <volk/volk.h> +#include <qa_32f_convert_8s_aligned16.h> +#include <volk/volk_32f_convert_8s_aligned16.h> +#include <cstdlib> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_8s_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32f_convert_8s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_sse[vlen] __attribute__ ((aligned (16))); + int8_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_convert_8s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_aligned16_manual(output_generic, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_aligned16_manual(output_sse, input0, 128.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_aligned16_manual(output_sse2, input0, 128.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_8s_aligned16.h b/volk/lib/qa_32f_convert_8s_aligned16.h new file mode 100644 index 000000000..68a523f34 --- /dev/null +++ b/volk/lib/qa_32f_convert_8s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_convert_8s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_8s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.cc b/volk/lib/qa_32f_convert_8s_unaligned16.cc new file mode 100644 index 000000000..d885fd6bb --- /dev/null +++ b/volk/lib/qa_32f_convert_8s_unaligned16.cc @@ -0,0 +1,70 @@ +#include <volk/volk.h> +#include <qa_32f_convert_8s_unaligned16.h> +#include <volk/volk_32f_convert_8s_unaligned16.h> +#include <cstdlib> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_8s_unaligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32f_convert_8s_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_sse[vlen] __attribute__ ((aligned (16))); + int8_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_convert_8s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_unaligned16_manual(output_generic, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_unaligned16_manual(output_sse, input0, 128.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_unaligned16_manual(output_sse2, input0, 128.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.h b/volk/lib/qa_32f_convert_8s_unaligned16.h new file mode 100644 index 000000000..88d4ff42a --- /dev/null +++ b/volk/lib/qa_32f_convert_8s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_convert_8s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_8s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc new file mode 100644 index 000000000..b20999beb --- /dev/null +++ b/volk/lib/qa_32f_divide_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32f_divide_aligned16.h> +#include <volk/volk_32f_divide_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_divide_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_divide_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_divide_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_divide_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_divide_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_divide_aligned16.h b/volk/lib/qa_32f_divide_aligned16.h new file mode 100644 index 000000000..79d5ae4b8 --- /dev/null +++ b/volk/lib/qa_32f_divide_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_DIVIDE_ALIGNED16_H +#define INCLUDED_QA_32F_DIVIDE_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_divide_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_divide_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_DIVIDE_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_dot_prod_aligned16.cc b/volk/lib/qa_32f_dot_prod_aligned16.cc new file mode 100644 index 000000000..98c1f2d99 --- /dev/null +++ b/volk/lib/qa_32f_dot_prod_aligned16.cc @@ -0,0 +1,183 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32f_dot_prod_aligned16.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifndef LV_HAVE_SSE4_1 + +#ifdef LV_HAVE_SSE3 +void qa_32f_dot_prod_aligned16::t1() { + const int vlen = 2046; + const int ITER = 100000; + + int i; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float * input; + float * taps; + + float * result_generic; + float * result_sse; + float * result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen* sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen *sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); + ret = 
posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); + + random_floats((float*)input, vlen); + random_floats((float*)taps, vlen); + + + printf("32f_dot_prod_aligned16\n"); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_generic[i], input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_sse[i], input, taps, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + printf("generic: %f ... sse: %f ... sse3 %f \n", result_generic[0], result_sse[0], result_sse3[0]); + + for(i = 0; i < ITER; i++){ + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse); + free(result_sse3); + +} +#else +void qa_32f_dot_prod_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#endif /* LV_HAVE_SSE3 */ + +#else + +void qa_32f_dot_prod_aligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 4095; + const int ITER = 100000; + + int i; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float * input; + float * taps; + + float * result_generic; + float * result_sse; + float * result_sse3; + float * result_sse4_1; + + ret = posix_memalign((void**)&input, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse4_1, 16, ITER*sizeof(float)); + + random_floats((float*)input, vlen); + random_floats((float*)taps, vlen); + + printf("32f_dot_prod_aligned16\n"); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_generic[i], input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_sse[i], input, taps, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + get_volk_runtime()->volk_32f_dot_prod_aligned16(&result_sse4_1[i], input, taps, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + //printf("generic: %f ... sse: %f ... sse3 %f ... 
sse4_1 %f \n", result_generic[0], result_sse[0], result_sse3[0], result_sse4_1[0]); + for(i =0; i < ITER; i++){ + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse4_1[i], fabs(result_generic[i])*ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse); + free(result_sse3); + free(result_sse4_1); + +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32f_dot_prod_aligned16.h b/volk/lib/qa_32f_dot_prod_aligned16.h new file mode 100644 index 000000000..6931a9e98 --- /dev/null +++ b/volk/lib/qa_32f_dot_prod_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H +#define INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_dot_prod_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_dot_prod_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_dot_prod_unaligned16.cc b/volk/lib/qa_32f_dot_prod_unaligned16.cc new file mode 100644 index 000000000..8e97d4249 --- /dev/null +++ b/volk/lib/qa_32f_dot_prod_unaligned16.cc @@ -0,0 +1,190 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32f_dot_prod_unaligned16.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifndef LV_HAVE_SSE4_1 + +#ifdef LV_HAVE_SSE3 +void qa_32f_dot_prod_unaligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 2046; + 
const int ITER = 100000; + + int i; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float * input; + float * taps; + + float * result_generic; + float * result_sse; + float * result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen* sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen *sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); + + random_floats((float*)input, vlen); + random_floats((float*)taps, vlen); + + + printf("32f_dot_prod_unaligned16\n"); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_generic[i], input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_sse[i], input, taps, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + printf("generic: %f ... sse: %f ... sse3 %f \n", result_generic[0], result_sse[0], result_sse3[0]); + + for(i = 0; i < ITER; i++){ + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse); + free(result_sse3); + +} +#else +void qa_32f_dot_prod_unaligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#endif /* LV_HAVE_SSE3 */ + +#else + +void qa_32f_dot_prod_unaligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 4095; + const int ITER = 100000; + + int i; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float * input; + float * taps; + + float * result_generic; + float * result_sse; + float * result_sse3; + float * result_sse4_1; + + ret = posix_memalign((void**)&input, 16, (vlen+1) * sizeof(float)); + ret = posix_memalign((void**)&taps, 16, (vlen+1) * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse4_1, 16, ITER*sizeof(float)); + + input = &input[1]; // Make sure the buffer is unaligned + taps = &taps[1]; // Make sure the buffer is unaligned + + random_floats((float*)input, vlen); + random_floats((float*)taps, vlen); + + printf("32f_dot_prod_unaligned16\n"); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_generic[i], input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_sse[i], input, taps, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + get_volk_runtime()->volk_32f_dot_prod_unaligned16(&result_sse4_1[i], input, taps, vlen); + } + end = clock(); + total = 
(double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + //printf("generic: %f ... sse: %f ... sse3 %f ... sse4_1 %f \n", result_generic[0], result_sse[0], result_sse3[0], result_sse4_1[0]); + for(i =0; i < ITER; i++){ + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse4_1[i], fabs(result_generic[i])*ERR_DELTA); + } + + free(&input[-1]); + free(&taps[-1]); + free(result_generic); + free(result_sse); + free(result_sse3); + free(result_sse4_1); + +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32f_dot_prod_unaligned16.h b/volk/lib/qa_32f_dot_prod_unaligned16.h new file mode 100644 index 000000000..e8bad07fe --- /dev/null +++ b/volk/lib/qa_32f_dot_prod_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H +#define INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_dot_prod_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_dot_prod_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc new file mode 100644 index 000000000..ca65add28 --- /dev/null +++ b/volk/lib/qa_32f_fm_detect_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32f_fm_detect_aligned16.h> +#include <volk/volk_32f_fm_detect_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_fm_detect_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_fm_detect_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + float input0[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_fm_detect_aligned\n"); + + start = clock(); + float save = 0.1; + for(int count = 0; count < ITERS; ++count) { + volk_32f_fm_detect_aligned16_manual(output0, input0, 1.0, &save, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + save = 0.1; + for(int count = 0; count < ITERS; ++count) { + volk_32f_fm_detect_aligned16_manual(output01, input0, 1.0, &save, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_fm_detect_aligned16.h b/volk/lib/qa_32f_fm_detect_aligned16.h new file mode 100644 index 000000000..a2680c524 --- /dev/null +++ b/volk/lib/qa_32f_fm_detect_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H +#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc new file mode 100644 index 000000000..a1c3d4cd1 --- /dev/null +++ b/volk/lib/qa_32f_index_max_aligned16.cc @@ -0,0 +1,103 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32f_index_max_aligned16.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 1000000 +#define VEC_LEN 3097 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE + +void qa_32f_index_max_aligned16::t1(){ + printf("sse not available... 
no test performed\n"); +} + +#else + + +void qa_32f_index_max_aligned16::t1(){ + + const int vlen = VEC_LEN; + + + volk_runtime_init(); + + volk_environment_init(); + int ret; + + unsigned int* target_sse4_1; + unsigned int* target_sse; + unsigned int* target_generic; + float* src0 ; + + + unsigned int i_target_sse4_1; + target_sse4_1 = &i_target_sse4_1; + unsigned int i_target_sse; + target_sse = &i_target_sse; + unsigned int i_target_generic; + target_generic = &i_target_generic; + + ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float)); + + random_floats((float*)src0, vlen); + + printf("32f_index_max_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + get_volk_runtime()->volk_32f_index_max_aligned16(target_sse4_1, src0, vlen); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1 time: %f\n", total); + + + printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]); + CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]); + CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]); + + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32f_index_max_aligned16.h b/volk/lib/qa_32f_index_max_aligned16.h new file mode 100644 index 000000000..8cadffa47 --- /dev/null +++ b/volk/lib/qa_32f_index_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H +#define 
INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_index_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.cc b/volk/lib/qa_32f_interleave_16sc_aligned16.cc new file mode 100644 index 000000000..2a937637f --- /dev/null +++ b/volk/lib/qa_32f_interleave_16sc_aligned16.cc @@ -0,0 +1,75 @@ +#include <volk/volk.h> +#include <qa_32f_interleave_16sc_aligned16.h> +#include <volk/volk_32f_interleave_16sc_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_32f_interleave_16sc_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32f_interleave_16sc_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + std::complex<int16_t> output_generic[vlen] __attribute__ ((aligned (16))); + std::complex<int16_t> output_sse[vlen] __attribute__ ((aligned (16))); + std::complex<int16_t> output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_interleave_16sc_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_16sc_aligned16_manual(output_generic, input0, input1, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < 
ITERS; ++count) { + volk_32f_interleave_16sc_aligned16_manual(output_sse, input0, input1, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_16sc_aligned16_manual(output_sse2, input0, input1, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse[i]), 1.01); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse[i]), 1.01); + + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse2[i]), 1.01); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse2[i]), 1.01); + } +} + +#endif diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.h b/volk/lib/qa_32f_interleave_16sc_aligned16.h new file mode 100644 index 000000000..8d2914817 --- /dev/null +++ b/volk/lib/qa_32f_interleave_16sc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H +#define INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_interleave_16sc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_interleave_16sc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.cc b/volk/lib/qa_32f_interleave_32fc_aligned16.cc new file mode 100644 index 000000000..c22dd1046 --- 
/dev/null +++ b/volk/lib/qa_32f_interleave_32fc_aligned16.cc @@ -0,0 +1,62 @@ +#include <volk/volk.h> +#include <qa_32f_interleave_32fc_aligned16.h> +#include <volk/volk_32f_interleave_32fc_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_interleave_32fc_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_interleave_32fc_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + std::complex<float> output_generic[vlen] __attribute__ ((aligned (16))); + std::complex<float> output_sse[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_interleave_32fc_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_32fc_aligned16_manual(output_generic, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_32fc_aligned16_manual(output_sse, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse[i]), fabs(std::real(output_generic[i]))*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse[i]), fabs(std::imag(output_generic[i]))*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.h b/volk/lib/qa_32f_interleave_32fc_aligned16.h new file mode 100644 index 000000000..cba518d37 --- /dev/null +++ b/volk/lib/qa_32f_interleave_32fc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H +#define INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_interleave_32fc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_interleave_32fc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_max_aligned16.cc b/volk/lib/qa_32f_max_aligned16.cc new file mode 100644 index 000000000..3ef375176 --- /dev/null +++ b/volk/lib/qa_32f_max_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32f_max_aligned16.h> +#include <volk/volk_32f_max_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_max_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_max_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_max_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_max_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_max_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_max_aligned16.h b/volk/lib/qa_32f_max_aligned16.h new file mode 100644 index 000000000..d535479f4 --- /dev/null +++ b/volk/lib/qa_32f_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_MAX_ALIGNED16_H +#define INCLUDED_QA_32F_MAX_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_min_aligned16.cc b/volk/lib/qa_32f_min_aligned16.cc new file mode 100644 index 000000000..617e18b24 --- /dev/null +++ b/volk/lib/qa_32f_min_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32f_min_aligned16.h> +#include <volk/volk_32f_min_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_min_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_min_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_min_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_min_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_min_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_min_aligned16.h b/volk/lib/qa_32f_min_aligned16.h new file mode 100644 index 000000000..90961ac92 --- /dev/null +++ b/volk/lib/qa_32f_min_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_MIN_ALIGNED16_H +#define INCLUDED_QA_32F_MIN_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_min_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_min_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_MIN_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc new file mode 100644 index 000000000..c77fe97da --- /dev/null +++ b/volk/lib/qa_32f_multiply_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32f_multiply_aligned16.h> +#include <volk/volk_32f_multiply_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_multiply_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_multiply_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_multiply_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_multiply_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_multiply_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_multiply_aligned16.h b/volk/lib/qa_32f_multiply_aligned16.h new file mode 100644 index 000000000..7032a2ad4 --- /dev/null +++ b/volk/lib/qa_32f_multiply_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H +#define INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_multiply_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_multiply_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_normalize_aligned16.cc b/volk/lib/qa_32f_normalize_aligned16.cc new file mode 100644 index 000000000..2954fc3ae --- /dev/null +++ b/volk/lib/qa_32f_normalize_aligned16.cc @@ -0,0 +1,65 @@ +#include <volk/volk.h> +#include <qa_32f_normalize_aligned16.h> +#include <volk/volk_32f_normalize_aligned16.h> +#include <cstdlib> +#include <cstring> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_normalize_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_normalize_aligned16::t1() { + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + const int vlen = 320001; + const int ITERS = 100; + + float* output0; + float* output01; + ret = posix_memalign((void**)&output0, 16, vlen*sizeof(float)); + ret = posix_memalign((void**)&output01, 16, vlen*sizeof(float)); + + for(int i = 0; i < vlen; ++i) { + output0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + memcpy(output01, output0, vlen*sizeof(float)); + printf("32f_normalize_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_normalize_aligned16_manual(output0, 1.15, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_normalize_aligned16_manual(output01, 1.15, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + // printf("%e...%e\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); + } + + free(output0); + free(output01); +} + +#endif diff --git a/volk/lib/qa_32f_normalize_aligned16.h b/volk/lib/qa_32f_normalize_aligned16.h new file mode 100644 index 000000000..7c421eb82 --- /dev/null +++ b/volk/lib/qa_32f_normalize_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H +#define INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_normalize_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_normalize_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_power_aligned16.cc b/volk/lib/qa_32f_power_aligned16.cc new file mode 100644 index 000000000..1b331daeb --- /dev/null +++ b/volk/lib/qa_32f_power_aligned16.cc @@ -0,0 +1,95 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32f_power_aligned16.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifdef LV_HAVE_SSE +void qa_32f_power_aligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 2046; + const int ITERS = 10000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float* input; + int i; + + float* result_generic; + float* result_sse; + float* result_sse4_1; + + ret = posix_memalign((void**)&input, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, vlen * sizeof(float)); + ret = 
posix_memalign((void**)&result_sse, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_sse4_1, 16, vlen * sizeof(float)); + + random_floats((float*)input, vlen); + + const float power = 3; + + printf("32f_power_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_power_aligned16_manual(result_generic, input, power, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_power_aligned16_manual(result_sse, input, power, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32f_power_aligned16(result_sse4_1, input, power, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1_time: %f\n", total); + + + for(i = 0; i < vlen; i++){ + //printf("%d %e -> %e %e %e\n", i, input[i], result_generic[i], result_sse[i], result_sse4_1[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(result_generic[i], result_sse[i], fabs(result_generic[i])* ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL(result_generic[i], result_sse4_1[i], fabs(result_generic[i])* ERR_DELTA); + } + + free(input); + free(result_generic); + free(result_sse); + +} +#else +void qa_32f_power_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#endif /* LV_HAVE_SSE */ + diff --git a/volk/lib/qa_32f_power_aligned16.h b/volk/lib/qa_32f_power_aligned16.h new file mode 100644 index 000000000..d45df4e56 --- /dev/null +++ b/volk/lib/qa_32f_power_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_POWER_ALIGNED16_H +#define INCLUDED_QA_32F_POWER_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_power_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_power_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_POWER_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_sqrt_aligned16.cc b/volk/lib/qa_32f_sqrt_aligned16.cc new file mode 100644 index 000000000..a3e6abc18 --- /dev/null +++ b/volk/lib/qa_32f_sqrt_aligned16.cc @@ -0,0 +1,59 @@ +#include <volk/volk.h> +#include <qa_32f_sqrt_aligned16.h> +#include <volk/volk_32f_sqrt_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_sqrt_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_sqrt_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + // No reason to test negative numbers because they result in NaN. 
+ for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand()) / static_cast<float>(RAND_MAX)); + } + printf("32f_sqrt_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_sqrt_aligned16_manual(output01, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_sqrt_aligned16.h b/volk/lib/qa_32f_sqrt_aligned16.h new file mode 100644 index 000000000..e4b99d981 --- /dev/null +++ b/volk/lib/qa_32f_sqrt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_SQRT_ALIGNED16_H +#define INCLUDED_QA_32F_SQRT_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_sqrt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_sqrt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_SQRT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_stddev_aligned16.cc b/volk/lib/qa_32f_stddev_aligned16.cc new file mode 100644 index 000000000..c0f22cdea --- /dev/null +++ b/volk/lib/qa_32f_stddev_aligned16.cc @@ -0,0 +1,74 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32f_stddev_aligned16.h> +#include <volk/volk_32f_stddev_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void 
qa_32f_stddev_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_stddev_aligned16::t1() { + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float stddev_generic; + float stddev_sse; + float stddev_sse4_1; + float mean = 0; + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + mean += input0[i]; + } + mean /= static_cast<float>(vlen); + + printf("32f_stddev_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_stddev_aligned16_manual(&stddev_generic, input0, mean, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_stddev_aligned16_manual(&stddev_sse, input0, mean, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32f_stddev_aligned16(&stddev_sse4_1, input0, mean, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse, fabs(stddev_generic)*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse4_1, fabs(stddev_generic)*1e-4); + +} + +#endif diff --git a/volk/lib/qa_32f_stddev_aligned16.h b/volk/lib/qa_32f_stddev_aligned16.h new file mode 100644 index 000000000..7f8d7a5fc --- /dev/null +++ b/volk/lib/qa_32f_stddev_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_STDDEV_ALIGNED16_H +#define INCLUDED_QA_32F_STDDEV_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_stddev_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_stddev_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_STDDEV_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.cc b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc new file mode 100644 index 000000000..dcad8bcf3 --- /dev/null +++ b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc @@ -0,0 +1,75 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32f_stddev_and_mean_aligned16.h> +#include <volk/volk_32f_stddev_and_mean_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_stddev_and_mean_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_stddev_and_mean_aligned16::t1() { + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float stddev_generic; + float stddev_sse; + float stddev_sse4_1; + float mean_generic; + float mean_sse; + float mean_sse4_1; + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_stddev_and_mean_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_stddev_and_mean_aligned16_manual(&stddev_generic, &mean_generic, input0,vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_stddev_and_mean_aligned16_manual(&stddev_sse, &mean_sse, input0,vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32f_stddev_and_mean_aligned16(&stddev_sse4_1, &mean_sse4_1, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse, fabs(stddev_generic)*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(mean_generic, mean_sse, fabs(mean_generic)*1e-4); + + CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse4_1, fabs(stddev_generic)*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(mean_generic, mean_sse4_1, fabs(mean_generic)*1e-4); + +} + +#endif diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.h b/volk/lib/qa_32f_stddev_and_mean_aligned16.h new file mode 100644 index 000000000..e08bd249a --- /dev/null +++ b/volk/lib/qa_32f_stddev_and_mean_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H +#define INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_stddev_and_mean_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_stddev_and_mean_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_subtract_aligned16.cc b/volk/lib/qa_32f_subtract_aligned16.cc new file mode 100644 index 000000000..a7e1b5ae3 --- /dev/null +++ b/volk/lib/qa_32f_subtract_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32f_subtract_aligned16.h> +#include <volk/volk_32f_subtract_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_subtract_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_subtract_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); + } + printf("32f_subtract_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_subtract_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_subtract_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_subtract_aligned16.h b/volk/lib/qa_32f_subtract_aligned16.h new file mode 100644 index 000000000..97c14f129 --- /dev/null +++ b/volk/lib/qa_32f_subtract_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H +#define INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_subtract_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_subtract_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_sum_of_poly_aligned16.cc b/volk/lib/qa_32f_sum_of_poly_aligned16.cc new file mode 100644 index 000000000..494776357 --- /dev/null +++ b/volk/lib/qa_32f_sum_of_poly_aligned16.cc @@ -0,0 +1,142 @@ +#include <volk/volk.h> +#include <qa_32f_sum_of_poly_aligned16.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <math.h> + +#define SNR 30.0 +#define CENTER -4.0 +#define CUTOFF -5.595 +#define ERR_DELTA (1e-4) +#define NUM_ITERS 100000 +#define VEC_LEN 64 +static float uniform() { + return ((float) rand() / RAND_MAX); // uniformly (0, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * -SNR/2.0; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32f_sum_of_poly_aligned16::t1(){ + printf("sse3 not available... 
no test performed\n"); +} + +#else + + +void qa_32f_sum_of_poly_aligned16::t1(){ + int i = 0; + + volk_environment_init(); + int ret; + + const int vlen = VEC_LEN; + float cutoff = CUTOFF; + + float* center_point_array; + float* target; + float* target_generic; + float* src0 ; + + + ret = posix_memalign((void**)&center_point_array, 16, 24); + ret = posix_memalign((void**)&target, 16, 4); + ret = posix_memalign((void**)&target_generic, 16, 4); + ret = posix_memalign((void**)&src0, 16, (vlen << 2)); + + + random_floats((float*)src0, vlen); + + float a = (float)CENTER; + float etoa = expf(a); + center_point_array[0] = (//(5.0 * a * a * a * a)/120.0 + + (-4.0 * a * a * a)/24.0 + + (3.0 * a * a)/6.0 + + (-2.0 * a)/2.0 + + (1.0)) * etoa; + center_point_array[1] = (//(-10.0 * a * a * a)/120.0 + + (6.0 * a * a)/24.0 + + (-3.0 * a)/6.0 + + (1.0/2.0)) * etoa; + center_point_array[2] = (//(10.0 * a * a)/120.0 + + (-4.0 * a)/24.0 + + (1.0/6.0)) * etoa; + center_point_array[3] = (//(-5.0 * a)/120.0 + + (1.0/24.0)) * etoa; + //center_point_array[4] = ((1.0)/120.0) * etoa; + center_point_array[4] = (//(a * a * a * a * a)/120.0 + + (a * a * a * a)/24.0 + + (a * a * a)/-6.0 + + (a * a)/2.0 + + -a + 1.0) * etoa; + + printf("32f_sum_of_poly_aligned16\n"); + + clock_t start, end; + double total; + + float my_sum = 0.0; + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + float sum = 0.0; + for(int l = 0; l < vlen; ++l) { + + sum += expf(src0[l]); + + } + my_sum = sum; + } + + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("exp time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + + volk_32f_sum_of_poly_aligned16_manual(target_generic, src0, center_point_array, &cutoff, vlen << 2, "generic"); + + } + + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_sum_of_poly_aligned16_manual(target, src0, 
center_point_array, &cutoff, vlen << 2, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 approx time: %f\n", total); + + + + printf("exp: %f, sse3: %f\n", my_sum, target[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], fabs(target_generic[0]) * ERR_DELTA); + + + free(center_point_array); + free(target); + free(target_generic); + free(src0); + + +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32f_sum_of_poly_aligned16.h b/volk/lib/qa_32f_sum_of_poly_aligned16.h new file mode 100644 index 000000000..67a347f9a --- /dev/null +++ b/volk/lib/qa_32f_sum_of_poly_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H +#define INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32f_sum_of_poly_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_sum_of_poly_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.cc b/volk/lib/qa_32fc_32f_multiply_aligned16.cc new file mode 100644 index 000000000..4eba0a3cd --- /dev/null +++ b/volk/lib/qa_32fc_32f_multiply_aligned16.cc @@ -0,0 +1,85 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32fc_32f_multiply_aligned16.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < 
n; i++) + buf[i] = uniform (); +} + +#ifdef LV_HAVE_SSE3 +void qa_32fc_32f_multiply_aligned16::t1() { + + const int vlen = 2046; + const int ITERS = 100000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex<float>* input; + float * taps; + int i; + + std::complex<float>* result_generic; + std::complex<float>* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, vlen * 2 * sizeof(float)); + + random_floats((float*)input, vlen * 2); + random_floats(taps, vlen); + + printf("32fc_32f_multiply_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_multiply_aligned16_manual(result_generic, input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} +#else +void qa_32fc_32f_multiply_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#endif /* LV_HAVE_SSE3 */ + diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.h b/volk/lib/qa_32fc_32f_multiply_aligned16.h new file mode 100644 index 000000000..fc3b3eeb2 --- /dev/null +++ b/volk/lib/qa_32fc_32f_multiply_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H +#define INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_32f_multiply_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_32f_multiply_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc b/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc new file mode 100644 index 000000000..64ea65da9 --- /dev/null +++ b/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc @@ -0,0 +1,83 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32fc_32f_power_32fc_aligned16.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1.5e-3) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifdef LV_HAVE_SSE +void qa_32fc_32f_power_32fc_aligned16::t1() { + + const int vlen = 2046; + const int ITERS = 10000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex<float>* input; + int i; + + std::complex<float>* result_generic; + std::complex<float>* result_sse; + + ret = 
posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, vlen * 2 * sizeof(float)); + + random_floats((float*)input, vlen * 2); + + const float power = 3.2; + + printf("32fc_32f_power_32fc_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_power_32fc_aligned16_manual(result_generic, input, power, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_power_32fc_aligned16_manual(result_sse, input, power, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + assertcomplexEqual(result_generic[i], result_sse[i], ERR_DELTA); + } + + free(input); + free(result_generic); + free(result_sse); + +} +#else +void qa_32fc_32f_power_32fc_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#endif /* LV_HAVE_SSE */ + diff --git a/volk/lib/qa_32fc_32f_power_32fc_aligned16.h b/volk/lib/qa_32fc_32f_power_32fc_aligned16.h new file mode 100644 index 000000000..464b7b7cc --- /dev/null +++ b/volk/lib/qa_32fc_32f_power_32fc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H +#define INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_32f_power_32fc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_32f_power_32fc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.cc b/volk/lib/qa_32fc_atan2_32f_aligned16.cc new file mode 100644 index 000000000..a24382d71 --- /dev/null +++ b/volk/lib/qa_32fc_atan2_32f_aligned16.cc @@ -0,0 +1,75 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32fc_atan2_32f_aligned16.h> +#include <volk/volk_32fc_atan2_32f_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32fc_atan2_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32fc_atan2_32f_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + std::complex<float> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); + } + printf("32fc_atan2_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_atan2_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_atan2_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32fc_atan2_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.h b/volk/lib/qa_32fc_atan2_32f_aligned16.h new file mode 100644 index 000000000..9c4dc209a --- /dev/null +++ b/volk/lib/qa_32fc_atan2_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_atan2_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_atan2_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc new file mode 100644 index 000000000..497914e0a --- /dev/null +++ b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc @@ -0,0 +1,137 @@ +#include <volk/volk.h> +#include <qa_32fc_conjugate_dot_prod_aligned16.h> +#include <stdlib.h> +#include <math.h> + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse + +#if LV_HAVE_SSE && LV_HAVE_64 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_conjugate_dot_prod_aligned16::t1() { + const int 
vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex<float>* input; + std::complex<float>* taps; + + std::complex<float>* result_generic; + std::complex<float>* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex<float>(0,0); + result[0] = std::complex<float>(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_conjugate_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_conjugate_dot_prod_aligned16_manual(result, input, taps, vlen * 8, "sse"); + + printf("32fc_conjugate_dot_prod_aligned16\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#elif LV_HAVE_SSE && LV_HAVE_32 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_conjugate_dot_prod_aligned16::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex<float>* input; + std::complex<float>* taps; + + std::complex<float>* result_generic; + std::complex<float>* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex<float>(0,0); + result[0] = std::complex<float>(0,0); + + random_floats((float*)input, vlen * 2); + 
random_floats((float*)taps, vlen * 2); + + + + volk_32fc_conjugate_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_conjugate_dot_prod_aligned16_manual(result, input, taps, vlen * 8, "sse_32"); + + printf("32fc_conjugate_dot_prod_aligned16\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#else + +void qa_32fc_conjugate_dot_prod_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h new file mode 100644 index 000000000..507b1769b --- /dev/null +++ b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H +#define INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_conjugate_dot_prod_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_conjugate_dot_prod_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc new file mode 100644 index 000000000..0f5a030f5 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include <volk/volk.h> +#include <qa_32fc_deinterleave_32f_aligned16.h> +#include <volk/volk_32fc_deinterleave_32f_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32fc_deinterleave_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<float> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_generic1[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse1[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); + } + printf("32fc_deinterleave_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], fabs(output_generic1[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.h b/volk/lib/qa_32fc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..78660e6ad --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_deinterleave_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc new file mode 100644 index 000000000..6e051afbc --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc @@ -0,0 +1,63 @@ +#include <volk/volk.h> +#include <qa_32fc_deinterleave_64f_aligned16.h> +#include <volk/volk_32fc_deinterleave_64f_aligned16.h> +#include <cstdlib> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32fc_deinterleave_64f_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_64f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<float> input0[vlen] __attribute__ ((aligned (16))); + + double output_generic[vlen] __attribute__ ((aligned (16))); + double output_generic1[vlen] __attribute__ ((aligned (16))); + double output_sse2[vlen] __attribute__ ((aligned (16))); + double output_sse21[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); + } + printf("32fc_deinterleave_64f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_64f_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_64f_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse21[i], fabs(output_generic1[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.h b/volk/lib/qa_32fc_deinterleave_64f_aligned16.h new file mode 100644 index 000000000..f924b9752 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_64f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_deinterleave_64f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_64f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc new file mode 100644 index 000000000..850518524 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32fc_deinterleave_real_16s_aligned16.h> +#include <volk/volk_32fc_deinterleave_real_16s_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32fc_deinterleave_real_16s_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_real_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<float> input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); + } + printf("32fc_deinterleave_real_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..68b80f27d --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc new file mode 100644 index 000000000..321deb184 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32fc_deinterleave_real_32f_aligned16.h> +#include <volk/volk_32fc_deinterleave_real_32f_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32fc_deinterleave_real_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_real_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<float> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); + } + printf("32fc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_32f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_32f_aligned16_manual(output_sse, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..765450bb6 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc new file mode 100644 index 000000000..aedb2e387 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32fc_deinterleave_real_64f_aligned16.h> +#include <volk/volk_32fc_deinterleave_real_64f_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_32fc_deinterleave_real_64f_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_real_64f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<float> input0[vlen] __attribute__ ((aligned (16))); + + double output_generic[vlen] __attribute__ ((aligned (16))); + double output_sse2[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); + } + printf("32fc_deinterleave_real_64f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_64f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_64f_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h new file mode 100644 index 000000000..3e55fb812 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_deinterleave_real_64f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_64f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_dot_prod_aligned16.cc b/volk/lib/qa_32fc_dot_prod_aligned16.cc new file mode 100644 index 000000000..bcf9ea954 --- /dev/null +++ b/volk/lib/qa_32fc_dot_prod_aligned16.cc @@ -0,0 +1,214 @@ +#include <volk/volk.h> +#include <qa_32fc_dot_prod_aligned16.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> +#include <stdio.h> + + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + + + +#if LV_HAVE_SSE3 +void qa_32fc_dot_prod_aligned16::t1() { + + const int vlen = 2046; + + 
volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex<float>* input; + std::complex<float>* taps; + + std::complex<float>* result_generic; + std::complex<float>* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result_sse3, 16, 8); + + + result_generic[0] = std::complex<float>(0,0); + result_sse3[0] = std::complex<float>(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + printf("32fc_dot_prod_aligned16\n"); + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse3"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + printf("generic: %f +i%f ... sse3: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0])); + + + assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} + +#else +void qa_32fc_dot_prod_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 +void qa_32fc_dot_prod_aligned16::t2() { + + const int vlen = 2046; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex<float>* input; + std::complex<float>* taps; + + std::complex<float>* result_generic; + std::complex<float>* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result_sse3, 16, 8); + + + result_generic[0] = std::complex<float>(0,0); + result_sse3[0] = std::complex<float>(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + printf("32fc_dot_prod_aligned16\n"); + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse_32"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_32_time: %f\n", total); + + printf("generic: %f +i%f ... sse_32: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0])); + + + assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} + +#else +void qa_32fc_dot_prod_aligned16::t2() { + printf("sse_32 not available... 
no test performed\n"); +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_64 + +void qa_32fc_dot_prod_aligned16::t3() { + + const int vlen = 2046; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex<float>* input; + std::complex<float>* taps; + + std::complex<float>* result_generic; + std::complex<float>* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result_sse3, 16, 8); + + + result_generic[0] = std::complex<float>(0,0); + result_sse3[0] = std::complex<float>(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + printf("32fc_dot_prod_aligned16\n"); + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse_64"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_64_time: %f\n", total); + + printf("generic: %f +i%f ... sse_64: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0])); + + + assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} + +#else +void qa_32fc_dot_prod_aligned16::t3() { + printf("sse_64 not available... 
no test performed\n"); +} + + + +#endif diff --git a/volk/lib/qa_32fc_dot_prod_aligned16.h b/volk/lib/qa_32fc_dot_prod_aligned16.h new file mode 100644 index 000000000..4b360db27 --- /dev/null +++ b/volk/lib/qa_32fc_dot_prod_aligned16.h @@ -0,0 +1,20 @@ +#ifndef INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H +#define INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_dot_prod_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_dot_prod_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); + void t2 (); + void t3 (); +}; + + +#endif /* INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc new file mode 100644 index 000000000..4d83f1639 --- /dev/null +++ b/volk/lib/qa_32fc_index_max_aligned16.cc @@ -0,0 +1,89 @@ +#include <volk/volk.h> +#include <qa_32fc_index_max_aligned16.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 1000000 +#define VEC_LEN 3096 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_index_max_aligned16::t1(){ + printf("sse3 not available... 
no test performed\n"); +} + +#else + + +void qa_32fc_index_max_aligned16::t1(){ + + const int vlen = VEC_LEN; + + volk_environment_init(); + int ret; + + unsigned int* target; + unsigned int* target_generic; + std::complex<float>* src0 ; + + + unsigned int i_target; + target = &i_target; + unsigned int i_target_generic; + target_generic = &i_target_generic; + ret = posix_memalign((void**)&src0, 16, vlen << 3); + + random_floats((float*)src0, vlen * 2); + + printf("32fc_index_max_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 time: %f\n", total); + + + + + printf("generic: %u, sse3: %u\n", target_generic[0], target[0]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1); + + + + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_index_max_aligned16.h b/volk/lib/qa_32fc_index_max_aligned16.h new file mode 100644 index 000000000..0990bcb1f --- /dev/null +++ b/volk/lib/qa_32fc_index_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H +#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_index_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc new file 
mode 100644 index 000000000..a4be1616b --- /dev/null +++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc @@ -0,0 +1,70 @@ +#include <volk/volk.h> +#include <qa_32fc_magnitude_16s_aligned16.h> +#include <volk/volk_32fc_magnitude_16s_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_magnitude_16s_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#else + +void qa_32fc_magnitude_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<float> input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse[vlen] __attribute__ ((aligned (16))); + int16_t output_sse3[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); + } + printf("32fc_magnitude_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_16s_aligned16_manual(output_sse3, input0, 32768.0, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1); + } +} + +#endif diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.h b/volk/lib/qa_32fc_magnitude_16s_aligned16.h new file mode 100644 index 000000000..ffdf1dd9e --- /dev/null +++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H +#define INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_magnitude_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_magnitude_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc new file mode 100644 index 000000000..d69ada408 --- /dev/null +++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc @@ -0,0 +1,70 @@ +#include <volk/volk.h> +#include <qa_32fc_magnitude_32f_aligned16.h> +#include <volk/volk_32fc_magnitude_32f_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_magnitude_32f_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#else + +void qa_32fc_magnitude_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<float> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse3[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); + } + printf("32fc_magnitude_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_32f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_32f_aligned16_manual(output_sse, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_32f_aligned16_manual(output_sse3, input0, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.h b/volk/lib/qa_32fc_magnitude_32f_aligned16.h new file mode 100644 index 000000000..a2881308c --- /dev/null +++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_magnitude_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_magnitude_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_multiply_aligned16.cc b/volk/lib/qa_32fc_multiply_aligned16.cc new file mode 100644 index 000000000..e1f7eab3d --- /dev/null +++ b/volk/lib/qa_32fc_multiply_aligned16.cc @@ -0,0 +1,86 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32fc_multiply_aligned16.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-3) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifdef LV_HAVE_SSE3 +void qa_32fc_multiply_aligned16::t1() { + + 
const int vlen = 2046; + const int ITERS = 100000; + + int i; + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex<float>* input; + std::complex<float>* taps; + + std::complex<float>* result_generic; + std::complex<float>* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, vlen*2*sizeof(float)); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + printf("32fc_multiply_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_multiply_aligned16_manual(result_generic, input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} +#else +void qa_32fc_multiply_aligned16::t1() { + printf("sse3 not available... 
// CppUnit fixture for the 32fc multiply kernel QA.
// The suite macros below register t1 as the only test of this fixture.
class qa_32fc_multiply_aligned16 : public CppUnit::TestCase {

  // Suite declaration: opens the suite, adds t1, closes the suite.
  CPPUNIT_TEST_SUITE (qa_32fc_multiply_aligned16);
  CPPUNIT_TEST (t1);
  CPPUNIT_TEST_SUITE_END ();

 private:
  // Test body; defined in the matching .cc file.
  void t1 ();
};
no test performed\n"); +} + +#else + +void qa_32fc_power_spectral_density_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + std::complex<float> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse3[vlen] __attribute__ ((aligned (16))); + + const float scalar = vlen; + const float rbw = 1.7; + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); + } + printf("32fc_power_spectral_density_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4)); + } +} + +#endif diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h new file mode 100644 index 000000000..26f430bec --- /dev/null +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc new file mode 100644 index 000000000..4d1359068 --- /dev/null +++ b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include <volk/volk.h> +#include <qa_32fc_power_spectrum_32f_aligned16.h> +#include <volk/volk_32fc_power_spectrum_32f_aligned16.h> +#include <cstdlib> + +//test for sse3 + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_power_spectrum_32f_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#else + +void qa_32fc_power_spectrum_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + std::complex<float> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse3[vlen] __attribute__ ((aligned (16))); + + const float scalar = vlen; + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); + } + + printf("32fc_power_spectrum_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectrum_32f_aligned16_manual(output_generic, input0, scalar, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectrum_32f_aligned16_manual(output_sse3, input0, scalar, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse33... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4)); + } +} + +#endif diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h new file mode 100644 index 000000000..d991223f3 --- /dev/null +++ b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_power_spectrum_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_power_spectrum_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_square_dist_aligned16.cc b/volk/lib/qa_32fc_square_dist_aligned16.cc new file mode 100644 index 000000000..d9ead8495 --- /dev/null +++ b/volk/lib/qa_32fc_square_dist_aligned16.cc @@ -0,0 +1,91 @@ +#include <volk/volk.h> +#include <qa_32fc_square_dist_aligned16.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 10000000 +#define VEC_LEN 64 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_square_dist_aligned16::t1(){ + printf("sse3 not available... 
no test performed\n"); +} + +#else + + +void qa_32fc_square_dist_aligned16::t1(){ + int i = 0; + + const int vlen = VEC_LEN; + volk_environment_init(); + int ret; + + float* target; + float* target_generic; + std::complex<float>* src0 ; + std::complex<float>* points; + + ret = posix_memalign((void**)&points, 16, vlen << 3); + ret = posix_memalign((void**)&target, 16, vlen << 2); + ret = posix_memalign((void**)&target_generic, 16, vlen << 2); + ret = posix_memalign((void**)&src0, 16, 8); + + random_floats((float*)points, vlen * 2); + random_floats((float*)src0, 2); + + printf("32fc_square_dist_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_square_dist_aligned16_manual(target_generic, src0, points, vlen << 3, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_square_dist_aligned16_manual(target, src0, points, vlen << 3, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 time: %f\n", total); + + + + for(; i < vlen; ++i) { + //printf("generic: %f, sse3: %f\n", target_generic[i], target[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[i], target[i], fabs(target_generic[i]) * ERR_DELTA); + } + + free(target); + free(target_generic); + free(points); + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_square_dist_aligned16.h b/volk/lib/qa_32fc_square_dist_aligned16.h new file mode 100644 index 000000000..9d365d8b0 --- /dev/null +++ b/volk/lib/qa_32fc_square_dist_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H +#define INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_square_dist_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE 
(qa_32fc_square_dist_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc new file mode 100644 index 000000000..f923d1d5c --- /dev/null +++ b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc @@ -0,0 +1,96 @@ +#include <volk/volk.h> +#include <qa_32fc_square_dist_scalar_mult_aligned16.h> +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + +#define ERR_DELTA .0001 +#define NUM_ITERS 10000000 +#define VEC_LEN 64 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_square_dist_scalar_mult_aligned16::t1(){ + printf("sse3 not available... 
no test performed\n"); +} + +#else + + +void qa_32fc_square_dist_scalar_mult_aligned16::t1(){ + int i = 0; + + const int vlen = VEC_LEN; + + volk_environment_init(); + int ret; + + float* target; + float* target_generic; + std::complex<float>* src0 ; + std::complex<float>* points; + float scalar; + + ret = posix_memalign((void**)&points, 16, vlen << 3); + ret = posix_memalign((void**)&target, 16, vlen << 2); + ret = posix_memalign((void**)&target_generic, 16, vlen << 2); + ret = posix_memalign((void**)&src0, 16, 8); + + random_floats((float*)points, vlen * 2); + random_floats((float*)src0, 2); + random_floats(&scalar, 1); + + printf("32fc_square_dist_scalar_mult_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_square_dist_scalar_mult_aligned16_manual(target_generic, src0, points, scalar, vlen << 3, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_square_dist_scalar_mult_aligned16_manual(target, src0, points, scalar, vlen << 3, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 time: %f\n", total); + + + + for(i = 0; i < vlen; ++i) { + printf("generic: %f, sse3: %f\n", target_generic[i], target[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target[i], target_generic[i], fabs(target_generic[1]) * ERR_DELTA);//, target_generic[1] * ERR_DELTA); + } + + free(target); + free(target_generic); + free(points); + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h new file mode 100644 index 000000000..ac4e3c45b --- /dev/null +++ b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H +#define 
INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32fc_square_dist_scalar_mult_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_square_dist_scalar_mult_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H */ diff --git a/volk/lib/qa_32s_and_aligned16.cc b/volk/lib/qa_32s_and_aligned16.cc new file mode 100644 index 000000000..72d05cf6f --- /dev/null +++ b/volk/lib/qa_32s_and_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32s_and_aligned16.h> +#include <volk/volk_32s_and_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32s_and_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32s_and_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int32_t input0[vlen] __attribute__ ((aligned (16))); + int32_t input1[vlen] __attribute__ ((aligned (16))); + + int32_t output0[vlen] __attribute__ ((aligned (16))); + int32_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int32_t) (rand() - (RAND_MAX/2))); + input1[i] = ((int32_t) (rand() - (RAND_MAX/2))); + } + printf("32s_and_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_and_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_and_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + 
//printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32s_and_aligned16.h b/volk/lib/qa_32s_and_aligned16.h new file mode 100644 index 000000000..dfcb47c63 --- /dev/null +++ b/volk/lib/qa_32s_and_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32S_AND_ALIGNED16_H +#define INCLUDED_QA_32S_AND_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32s_and_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32s_and_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32S_AND_ALIGNED16_H */ diff --git a/volk/lib/qa_32s_convert_32f_aligned16.cc b/volk/lib/qa_32s_convert_32f_aligned16.cc new file mode 100644 index 000000000..eab3fe016 --- /dev/null +++ b/volk/lib/qa_32s_convert_32f_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32s_convert_32f_aligned16.h> +#include <volk/volk_32s_convert_32f_aligned16.h> +#include <cstdlib> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32s_convert_32f_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32s_convert_32f_aligned16::t1() { + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + + int32_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int32_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0)); + } + printf("32s_convert_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_convert_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_convert_32f_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_32s_convert_32f_aligned16.h b/volk/lib/qa_32s_convert_32f_aligned16.h new file mode 100644 index 000000000..efd2a2eea --- /dev/null +++ b/volk/lib/qa_32s_convert_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H +#define INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32s_convert_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32s_convert_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.cc b/volk/lib/qa_32s_convert_32f_unaligned16.cc new file mode 100644 index 000000000..0e504cfa1 --- /dev/null +++ b/volk/lib/qa_32s_convert_32f_unaligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32s_convert_32f_unaligned16.h> +#include <volk/volk_32s_convert_32f_unaligned16.h> +#include <cstdlib> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32s_convert_32f_unaligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32s_convert_32f_unaligned16::t1() { + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + + int32_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int32_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0)); + } + printf("32s_convert_32f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_convert_32f_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_convert_32f_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.h b/volk/lib/qa_32s_convert_32f_unaligned16.h new file mode 100644 index 000000000..5006f5fd8 --- /dev/null +++ b/volk/lib/qa_32s_convert_32f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H +#define INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32s_convert_32f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32s_convert_32f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_32s_or_aligned16.cc b/volk/lib/qa_32s_or_aligned16.cc new file mode 100644 index 000000000..e09dfb91c --- /dev/null +++ b/volk/lib/qa_32s_or_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_32s_or_aligned16.h> +#include <volk/volk_32s_or_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32s_or_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32s_or_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int32_t input0[vlen] __attribute__ ((aligned (16))); + int32_t input1[vlen] __attribute__ ((aligned (16))); + + int32_t output0[vlen] __attribute__ ((aligned (16))); + int32_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int32_t) (rand() - (RAND_MAX/2))); + input1[i] = ((int32_t) (rand() - (RAND_MAX/2))); + } + printf("32s_or_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_or_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_or_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32s_or_aligned16.h b/volk/lib/qa_32s_or_aligned16.h new file mode 100644 index 000000000..9e949eb52 --- /dev/null +++ b/volk/lib/qa_32s_or_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32S_OR_ALIGNED16_H +#define INCLUDED_QA_32S_OR_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32s_or_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32s_or_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32S_OR_ALIGNED16_H */ diff --git a/volk/lib/qa_32u_byteswap_aligned16.cc b/volk/lib/qa_32u_byteswap_aligned16.cc new file mode 100644 index 000000000..8b1023876 --- /dev/null +++ b/volk/lib/qa_32u_byteswap_aligned16.cc @@ -0,0 +1,59 @@ +#include <volk/volk.h> +#include <qa_32u_byteswap_aligned16.h> +#include <volk/volk_32u_byteswap_aligned16.h> +#include <cstdlib> +#include <cstring> + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_32u_byteswap_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32u_byteswap_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100001; + + uint32_t output0[vlen] __attribute__ ((aligned (16))); + uint32_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + output0[i] = (uint32_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); + } + memcpy(output01, output0, vlen*sizeof(uint32_t)); + printf("32u_byteswap_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32u_byteswap_aligned16_manual(output0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32u_byteswap_aligned16_manual(output01, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32u_byteswap_aligned16.h b/volk/lib/qa_32u_byteswap_aligned16.h new file mode 100644 index 000000000..47bad4c3d --- /dev/null +++ b/volk/lib/qa_32u_byteswap_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H +#define INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32u_byteswap_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32u_byteswap_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H */ diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc new file mode 100644 index 000000000..49fcddeb2 --- /dev/null +++ b/volk/lib/qa_32u_popcnt_aligned16.cc @@ -0,0 +1,61 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_32u_popcnt_aligned16.h> +#include <volk/volk_32u_popcnt_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE4_2 + +void qa_32u_popcnt_aligned16::t1() { + printf("sse4.2 not available... 
no test performed\n"); +} + +#else + +void qa_32u_popcnt_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + + const int ITERS = 10000000; + uint32_t input0 __attribute__ ((aligned (16))); + + uint32_t output0 __attribute__ ((aligned (16))); + uint32_t output01 __attribute__ ((aligned (16))); + + input0 = ((uint32_t) (rand() - (RAND_MAX/2))); + output0 = 0; + output01 = 0; + + printf("32u_popcnt_aligned\n"); + + start = clock(); + uint32_t ret = 0; + for(int count = 0; count < ITERS; ++count) { + volk_32u_popcnt_aligned16_manual(&ret, input0, "generic"); + output0 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + ret = 0; + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32u_popcnt_aligned16(&ret, input0); + output01 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.2_time: %f\n", total); + + + CPPUNIT_ASSERT_EQUAL(output0, output01); +} + +#endif diff --git a/volk/lib/qa_32u_popcnt_aligned16.h b/volk/lib/qa_32u_popcnt_aligned16.h new file mode 100644 index 000000000..fa1dc1041 --- /dev/null +++ b/volk/lib/qa_32u_popcnt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32U_POPCNT_ALIGNED16_H +#define INCLUDED_QA_32U_POPCNT_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_32u_popcnt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32u_popcnt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32U_POPCNT_ALIGNED16_H */ diff --git a/volk/lib/qa_64f_convert_32f_aligned16.cc b/volk/lib/qa_64f_convert_32f_aligned16.cc new file mode 100644 index 000000000..0eaebf00a --- /dev/null +++ b/volk/lib/qa_64f_convert_32f_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_64f_convert_32f_aligned16.h> 
+#include <volk/volk_64f_convert_32f_aligned16.h> +#include <cstdlib> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_64f_convert_32f_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_64f_convert_32f_aligned16::t1() { + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + + double input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2)); + } + printf("64f_convert_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_convert_32f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_convert_32f_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_64f_convert_32f_aligned16.h b/volk/lib/qa_64f_convert_32f_aligned16.h new file mode 100644 index 000000000..95d79f73d --- /dev/null +++ b/volk/lib/qa_64f_convert_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H +#define INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_64f_convert_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64f_convert_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.cc b/volk/lib/qa_64f_convert_32f_unaligned16.cc new file mode 100644 index 000000000..dcf94bd27 --- /dev/null +++ b/volk/lib/qa_64f_convert_32f_unaligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_64f_convert_32f_unaligned16.h> +#include <volk/volk_64f_convert_32f_unaligned16.h> +#include <cstdlib> + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_64f_convert_32f_unaligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_64f_convert_32f_unaligned16::t1() { + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + + double input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2)); + } + printf("64f_convert_32f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_convert_32f_unaligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_convert_32f_unaligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.h b/volk/lib/qa_64f_convert_32f_unaligned16.h new file mode 100644 index 000000000..430327e81 --- /dev/null +++ b/volk/lib/qa_64f_convert_32f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H +#define INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_64f_convert_32f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64f_convert_32f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_64f_max_aligned16.cc b/volk/lib/qa_64f_max_aligned16.cc new file mode 100644 index 000000000..41ab078b0 --- /dev/null +++ b/volk/lib/qa_64f_max_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_64f_max_aligned16.h> +#include <volk/volk_64f_max_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_64f_max_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_64f_max_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + double input0[vlen] __attribute__ ((aligned (16))); + double input1[vlen] __attribute__ ((aligned (16))); + + double output0[vlen] __attribute__ ((aligned (16))); + double output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2)); + input1[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2)); + } + printf("64f_max_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_max_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_max_aligned16_manual(output01, input0, input1, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_64f_max_aligned16.h b/volk/lib/qa_64f_max_aligned16.h new file mode 100644 index 000000000..7cbd4d4c1 --- /dev/null +++ b/volk/lib/qa_64f_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64F_MAX_ALIGNED16_H +#define INCLUDED_QA_64F_MAX_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_64f_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64f_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64F_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_64f_min_aligned16.cc b/volk/lib/qa_64f_min_aligned16.cc new file mode 100644 index 000000000..b4664d065 --- /dev/null +++ b/volk/lib/qa_64f_min_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_64f_min_aligned16.h> +#include <volk/volk_64f_min_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_64f_min_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_64f_min_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + double input0[vlen] __attribute__ ((aligned (16))); + double input1[vlen] __attribute__ ((aligned (16))); + + double output0[vlen] __attribute__ ((aligned (16))); + double output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2)); + input1[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2)); + } + printf("64f_min_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_min_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_min_aligned16_manual(output01, input0, input1, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_64f_min_aligned16.h b/volk/lib/qa_64f_min_aligned16.h new file mode 100644 index 000000000..a0e95395f --- /dev/null +++ b/volk/lib/qa_64f_min_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64F_MIN_ALIGNED16_H +#define INCLUDED_QA_64F_MIN_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_64f_min_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64f_min_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64F_MIN_ALIGNED16_H */ diff --git a/volk/lib/qa_64u_byteswap_aligned16.cc b/volk/lib/qa_64u_byteswap_aligned16.cc new file mode 100644 index 000000000..4f5d4d02b --- /dev/null +++ b/volk/lib/qa_64u_byteswap_aligned16.cc @@ -0,0 +1,59 @@ +#include <volk/volk.h> +#include <qa_64u_byteswap_aligned16.h> +#include <volk/volk_64u_byteswap_aligned16.h> +#include <cstdlib> +#include <cstring> + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_64u_byteswap_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_64u_byteswap_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100001; + + uint64_t output0[vlen] __attribute__ ((aligned (16))); + uint64_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + output0[i] = (uint64_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); + } + memcpy(output01, output0, vlen*sizeof(uint64_t)); + printf("64u_byteswap_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64u_byteswap_aligned16_manual(output0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64u_byteswap_aligned16_manual(output01, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_64u_byteswap_aligned16.h b/volk/lib/qa_64u_byteswap_aligned16.h new file mode 100644 index 000000000..a4fa0c983 --- /dev/null +++ b/volk/lib/qa_64u_byteswap_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H +#define INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_64u_byteswap_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64u_byteswap_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H */ diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc new file mode 100644 index 000000000..bce9ff6c2 --- /dev/null +++ b/volk/lib/qa_64u_popcnt_aligned16.cc @@ -0,0 +1,61 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_64u_popcnt_aligned16.h> +#include <volk/volk_64u_popcnt_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE4_2 + +void qa_64u_popcnt_aligned16::t1() { + printf("sse4.2 not available... 
no test performed\n"); +} + +#else + +void qa_64u_popcnt_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + + const int ITERS = 10000000; + uint64_t input0 __attribute__ ((aligned (16))); + + uint64_t output0 __attribute__ ((aligned (16))); + uint64_t output01 __attribute__ ((aligned (16))); + + input0 = ((uint64_t) (rand() - (RAND_MAX/2))); + output0 = 0; + output01 = 0; + + printf("64u_popcnt_aligned\n"); + + start = clock(); + uint64_t ret = 0; + for(int count = 0; count < ITERS; ++count) { + volk_64u_popcnt_aligned16_manual(&ret, input0, "generic"); + output0 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + ret = 0; + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_64u_popcnt_aligned16(&ret, input0); + output01 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.2_time: %f\n", total); + + + CPPUNIT_ASSERT_EQUAL(output0, output01); +} + +#endif diff --git a/volk/lib/qa_64u_popcnt_aligned16.h b/volk/lib/qa_64u_popcnt_aligned16.h new file mode 100644 index 000000000..217822d6e --- /dev/null +++ b/volk/lib/qa_64u_popcnt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64U_POPCNT_ALIGNED16_H +#define INCLUDED_QA_64U_POPCNT_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_64u_popcnt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64u_popcnt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64U_POPCNT_ALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_16s_aligned16.cc b/volk/lib/qa_8s_convert_16s_aligned16.cc new file mode 100644 index 000000000..35f08fb81 --- /dev/null +++ b/volk/lib/qa_8s_convert_16s_aligned16.cc @@ -0,0 +1,63 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include 
<qa_8s_convert_16s_aligned16.h> +#include <volk/volk_8s_convert_16s_aligned16.h> +#include <cstdlib> + +//test for sse4_1 + +#ifndef LV_HAVE_SSE4_1 + +void qa_8s_convert_16s_aligned16::t1() { + printf("sse4.1 not available... no test performed\n"); +} + +#else + +void qa_8s_convert_16s_aligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int8_t input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0)); + } + printf("8s_convert_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_16s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8s_convert_16s_aligned16(output_sse4_1, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8s_convert_16s_aligned16.h b/volk/lib/qa_8s_convert_16s_aligned16.h new file mode 100644 index 000000000..38739fc96 --- /dev/null +++ b/volk/lib/qa_8s_convert_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H +#define INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_8s_convert_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8s_convert_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.cc b/volk/lib/qa_8s_convert_16s_unaligned16.cc new file mode 100644 index 000000000..bb326f818 --- /dev/null +++ b/volk/lib/qa_8s_convert_16s_unaligned16.cc @@ -0,0 +1,63 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_8s_convert_16s_unaligned16.h> +#include <volk/volk_8s_convert_16s_unaligned16.h> +#include <cstdlib> + +//test for sse4_1 + +#ifndef LV_HAVE_SSE4_1 + +void qa_8s_convert_16s_unaligned16::t1() { + printf("sse4.1 not available... 
no test performed\n"); +} + +#else + +void qa_8s_convert_16s_unaligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int8_t input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0)); + } + printf("8s_convert_16s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_16s_unaligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8s_convert_16s_unaligned16(output_sse4_1, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.h b/volk/lib/qa_8s_convert_16s_unaligned16.h new file mode 100644 index 000000000..d39fffc35 --- /dev/null +++ b/volk/lib/qa_8s_convert_16s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H +#define INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_8s_convert_16s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8s_convert_16s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_32f_aligned16.cc b/volk/lib/qa_8s_convert_32f_aligned16.cc new file mode 100644 index 000000000..522da0b9d --- /dev/null +++ b/volk/lib/qa_8s_convert_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_8s_convert_32f_aligned16.h> +#include <volk/volk_8s_convert_32f_aligned16.h> +#include <cstdlib> + +//test for sse4.1 + +#ifndef LV_HAVE_SSE4_1 + +void qa_8s_convert_32f_aligned16::t1() { + printf("sse4_1 not available... 
no test performed\n"); +} + +#else + +void qa_8s_convert_32f_aligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int8_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0)); + } + printf("8s_convert_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8s_convert_32f_aligned16(output_sse4_1, input0, 128.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8s_convert_32f_aligned16.h b/volk/lib/qa_8s_convert_32f_aligned16.h new file mode 100644 index 000000000..7f8401d42 --- /dev/null +++ b/volk/lib/qa_8s_convert_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H +#define INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_8s_convert_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8s_convert_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.cc b/volk/lib/qa_8s_convert_32f_unaligned16.cc new file mode 100644 index 000000000..ea1fb7c74 --- /dev/null +++ b/volk/lib/qa_8s_convert_32f_unaligned16.cc @@ -0,0 +1,63 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_8s_convert_32f_unaligned16.h> +#include <volk/volk_8s_convert_32f_unaligned16.h> +#include <cstdlib> + +//test for sse4.1 + +#ifndef LV_HAVE_SSE4_1 + +void qa_8s_convert_32f_unaligned16::t1() { + printf("sse4_1 not available... 
no test performed\n"); +} + +#else + +void qa_8s_convert_32f_unaligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int8_t input0[vlen+1] __attribute__ ((aligned (16))); + + float output_generic[vlen+1] __attribute__ ((aligned (16))); + float output_sse4_1[vlen+1] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0)); + } + printf("8s_convert_32f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_32f_unaligned16_manual(output_generic, &input0[1], 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8s_convert_32f_unaligned16(output_sse4_1, &input0[1], 128.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%e...%e\n", output_generic[i], output_sse4_1[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.h b/volk/lib/qa_8s_convert_32f_unaligned16.h new file mode 100644 index 000000000..aad2f8c22 --- /dev/null +++ b/volk/lib/qa_8s_convert_32f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H +#define INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_8s_convert_32f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8s_convert_32f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc new file mode 100644 index 000000000..823e7fe2e --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc @@ -0,0 +1,67 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_8sc_deinterleave_16s_aligned16.h> +#include <volk/volk_8sc_deinterleave_16s_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +void qa_8sc_deinterleave_16s_aligned16::t1() { + printf("sse4_1 not available... 
no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_16s_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_generic1[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_11[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_deinterleave_16s_aligned16(output_sse4_1, output_sse4_11, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_sse4_11[i]); + } +} + +#endif diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.h b/volk/lib/qa_8sc_deinterleave_16s_aligned16.h new file mode 100644 index 000000000..9c99fed70 --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_8sc_deinterleave_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc new file mode 100644 index 000000000..fb580516c --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc @@ -0,0 +1,134 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_8sc_deinterleave_32f_aligned16.h> +#include <volk/volk_8sc_deinterleave_32f_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +#ifndef LV_HAVE_SSE + +void qa_8sc_deinterleave_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_generic1[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse1[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, 128.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], fabs(output_generic1[i])*1e-4); // tolerance is relative to the value actually being compared + } +} + +#endif /* LV_HAVE_SSE */ + +#else + +void qa_8sc_deinterleave_32f_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_generic1[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse1[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + float output_sse14_1[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, 128.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_deinterleave_32f_aligned16(output_sse4_1, output_sse14_1, input0, 128.0, vlen); + } + end = clock(); + total = 
(double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1_time: %f\n", total); + + for(int i = 0; i < vlen; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("%d generic... %e %e, sse... %e %e sse4.1... %e %e\n", i, output_generic[i], output_generic1[i], output_sse[i], output_sse1[i], output_sse4_1[i], output_sse14_1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i],std::max<double>(fabs(output_generic[i])*1e-4, 1e-4)); // relative tolerance on |expected|, floored at 1e-4 + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], std::max<double>(fabs(output_generic1[i])*1e-4, 1e-4)); + + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], std::max<double>(fabs(output_generic[i])*1e-4, 1e-4)); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse14_1[i], std::max<double>(fabs(output_generic1[i])*1e-4, 1e-4)); + } +} + + +#endif /* LV_HAVE_SSE4_1 */ diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.h b/volk/lib/qa_8sc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..63b5fdadb --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_8sc_deinterleave_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc new file mode 100644 index 000000000..1cc844b52 --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc @@ -0,0 +1,64 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include 
<qa_8sc_deinterleave_real_16s_aligned16.h> +#include <volk/volk_8sc_deinterleave_real_16s_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +void qa_8sc_deinterleave_real_16s_aligned16::t1() { + printf("sse4_1 not available... no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_real_16s_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_real_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_16s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_deinterleave_real_16s_aligned16(output_sse4_1, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..02050926f --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_8sc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc new file mode 100644 index 000000000..10e537cde --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc @@ -0,0 +1,138 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_8sc_deinterleave_real_32f_aligned16.h> +#include <volk/volk_8sc_deinterleave_real_32f_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +#ifndef LV_HAVE_SSE + +void qa_8sc_deinterleave_real_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_real_32f_aligned16::t1() { + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + } +} + +#endif /* LV_HAVE_SSE */ + +#else + +void qa_8sc_deinterleave_real_32f_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int8_t> *input0; + + float* output_generic; + float* output_sse; + float* output_sse4_1; + + ret = posix_memalign((void**)&input0, 16, 2*vlen * sizeof(int8_t)); + ret = posix_memalign((void**)&output_generic, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&output_sse, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&output_sse4_1, 16, vlen * sizeof(float)); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); // scale the ratio to int8 range BEFORE the narrowing cast; casting first truncated nearly every sample to 0 + } + + printf("8sc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 128.0, vlen, "sse"); // must use the same scalar as the generic and runtime runs for the outputs to be comparable + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_deinterleave_real_32f_aligned16(output_sse4_1, input0, 128.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, 
%d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4); + } + + free(input0); + free(output_generic); + free(output_sse); + free(output_sse4_1); +} + +#endif /* LV_HAVE_SSE4_1 */ diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..93338e488 --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_8sc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc new file mode 100644 index 000000000..d84df8119 --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc @@ -0,0 +1,60 @@ +#include <volk/volk.h> +#include <qa_8sc_deinterleave_real_8s_aligned16.h> +#include <volk/volk_8sc_deinterleave_real_8s_aligned16.h> +#include <cstdlib> + +//test for sse + +#ifndef LV_HAVE_SSSE3 + +void qa_8sc_deinterleave_real_8s_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_real_8s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_ssse3[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_real_8s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_8s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); + } +} + +#endif diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h new file mode 100644 index 000000000..92fc0dd4a --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_8sc_deinterleave_real_8s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_8s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc new file mode 100644 index 000000000..d64eac8ce --- /dev/null +++ b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc @@ -0,0 +1,87 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_8sc_multiply_conjugate_16sc_aligned16.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +#ifndef LV_HAVE_SSE4_1 + +void qa_8sc_multiply_conjugate_16sc_aligned16::t1() { + printf("sse4.1 not available... 
no test performed\n"); +} + +#else + +void qa_8sc_multiply_conjugate_16sc_aligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 2046; + const int ITERS = 100000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex<int8_t>* input; + std::complex<int8_t>* taps; + + std::complex<int16_t>* result_generic; + std::complex<int16_t>* result_sse4_1; + int i; + int8_t* inputInt8_T; + int8_t* tapsInt8_T; + + ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(int8_t)); + ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(int8_t)); + ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(int16_t)); + ret = posix_memalign((void**)&result_sse4_1, 16, vlen*2*sizeof(int16_t)); + + inputInt8_T = (int8_t*)input; + tapsInt8_T = (int8_t*)taps; + for(int i = 0; i < vlen*2; ++i) { + inputInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + tapsInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + } + + printf("8sc_multiply_conjugate_16sc_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_multiply_conjugate_16sc_aligned16_manual((std::complex<int16_t>*)result_generic, (std::complex<int8_t>*)input, (std::complex<int8_t>*)taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_multiply_conjugate_16sc_aligned16((std::complex<int16_t>*)result_sse4_1, (std::complex<int8_t>*)input, (std::complex<int8_t>*)taps, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + //printf("%d %d+%di %d+%di -> %d+%di %d+%di\n", i, std::real(input[i]), std::imag(input[i]), std::real(taps[i]), std::imag(taps[i]), 
std::real(result_generic[i]), std::imag(result_generic[i]), std::real(result_sse4_1[i]), std::imag(result_sse4_1[i])); + + assertcomplexEqual(result_generic[i], result_sse4_1[i], ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse4_1); + +} + +#endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h new file mode 100644 index 000000000..0e78a5eca --- /dev/null +++ b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H +#define INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_8sc_multiply_conjugate_16sc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_multiply_conjugate_16sc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc new file mode 100644 index 000000000..c27f0e0ca --- /dev/null +++ b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc @@ -0,0 +1,87 @@ +#include <volk/volk_runtime.h> +#include <volk/volk.h> +#include <qa_8sc_multiply_conjugate_32fc_aligned16.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +#ifndef LV_HAVE_SSE4_1 + +void qa_8sc_multiply_conjugate_32fc_aligned16::t1() { + printf("sse4.1 not available... 
no test performed\n"); +} + +#else + +void qa_8sc_multiply_conjugate_32fc_aligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 2046; + const int ITERS = 100000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex<int8_t>* input; + std::complex<int8_t>* taps; + + std::complex<float>* result_generic; + std::complex<float>* result_sse4_1; + int i; + int8_t* inputInt8_T; + int8_t* tapsInt8_T; + + ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(int8_t)); + ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(int8_t)); + ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&result_sse4_1, 16, vlen*2*sizeof(float)); + + + inputInt8_T = (int8_t*)input; + tapsInt8_T = (int8_t*)taps; + for(int i = 0; i < vlen*2; ++i) { + inputInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + tapsInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + } + + printf("8sc_multiply_conjugate_32fc_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_multiply_conjugate_32fc_aligned16_manual(result_generic, (const std::complex<int8_t>*)input, (const std::complex<int8_t>*)taps, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_multiply_conjugate_32fc_aligned16(result_sse4_1, (const std::complex<int8_t>*)input, (const std::complex<int8_t>*)taps, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + //printf("%d %d+%di %d+%di -> %e+%ei %e+%ei\n", i, std::real(input[i]), std::imag(input[i]), std::real(taps[i]), std::imag(taps[i]), 
std::real(result_generic[i]), std::imag(result_generic[i]), std::real(result_sse4_1[i]), std::imag(result_sse4_1[i])); + assertcomplexEqual(result_generic[i], result_sse4_1[i], ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse4_1); + +} + +#endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h new file mode 100644 index 000000000..eb9ae309c --- /dev/null +++ b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H +#define INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/TestCase.h> + +class qa_8sc_multiply_conjugate_32fc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_multiply_conjugate_32fc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H */ diff --git a/volk/lib/qa_volk.cc b/volk/lib/qa_volk.cc new file mode 100644 index 000000000..c3c27b69b --- /dev/null +++ b/volk/lib/qa_volk.cc @@ -0,0 +1,211 @@ +/* + * Copyright 2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. 
If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +/* + * This class gathers together all the test cases for the example + * directory into a single test suite. As you create new test cases, + * add them here. + */ + +#include <qa_volk.h> +#include <qa_16s_quad_max_star_aligned16.h> +#include <qa_32fc_dot_prod_aligned16.h> +#include <qa_32fc_square_dist_aligned16.h> +#include <qa_32fc_square_dist_scalar_mult_aligned16.h> +#include <qa_32f_sum_of_poly_aligned16.h> +#include <qa_32fc_index_max_aligned16.h> +#include <qa_32f_index_max_aligned16.h> +#include <qa_32fc_conjugate_dot_prod_aligned16.h> +#include <qa_16s_permute_and_scalar_add_aligned16.h> +#include <qa_16s_branch_4_state_8_aligned16.h> +#include <qa_16s_max_star_horizontal_aligned16.h> +#include <qa_16s_max_star_aligned16.h> +#include <qa_16s_add_quad_aligned16.h> +#include <qa_32f_add_aligned16.h> +#include <qa_32f_subtract_aligned16.h> +#include <qa_32f_max_aligned16.h> +#include <qa_32f_min_aligned16.h> +#include <qa_64f_max_aligned16.h> +#include <qa_64f_min_aligned16.h> +#include <qa_32s_and_aligned16.h> +#include <qa_32s_or_aligned16.h> +#include <qa_32f_dot_prod_aligned16.h> +#include <qa_32f_dot_prod_unaligned16.h> +#include <qa_32f_fm_detect_aligned16.h> +#include <qa_32fc_32f_multiply_aligned16.h> +#include <qa_32fc_multiply_aligned16.h> +#include <qa_32f_divide_aligned16.h> +#include <qa_32f_multiply_aligned16.h> +#include <qa_32f_sqrt_aligned16.h> +#include <qa_8sc_multiply_conjugate_16sc_aligned16.h> +#include <qa_8sc_multiply_conjugate_32fc_aligned16.h> +#include <qa_32u_popcnt_aligned16.h> +#include <qa_64u_popcnt_aligned16.h> +#include <qa_16u_byteswap_aligned16.h> +#include <qa_32u_byteswap_aligned16.h> +#include <qa_64u_byteswap_aligned16.h> +#include <qa_32f_normalize_aligned16.h> +#include <qa_16sc_deinterleave_16s_aligned16.h> +#include <qa_16sc_deinterleave_32f_aligned16.h> +#include 
<qa_16sc_deinterleave_real_16s_aligned16.h> +#include <qa_16sc_deinterleave_real_32f_aligned16.h> +#include <qa_16sc_deinterleave_real_8s_aligned16.h> +#include <qa_16sc_magnitude_16s_aligned16.h> +#include <qa_16sc_magnitude_32f_aligned16.h> +#include <qa_32fc_deinterleave_32f_aligned16.h> +#include <qa_32fc_deinterleave_64f_aligned16.h> +#include <qa_32fc_deinterleave_real_16s_aligned16.h> +#include <qa_32fc_deinterleave_real_32f_aligned16.h> +#include <qa_32fc_deinterleave_real_64f_aligned16.h> +#include <qa_32fc_magnitude_16s_aligned16.h> +#include <qa_32fc_magnitude_32f_aligned16.h> +#include <qa_32f_interleave_16sc_aligned16.h> +#include <qa_32f_interleave_32fc_aligned16.h> +#include <qa_8sc_deinterleave_16s_aligned16.h> +#include <qa_8sc_deinterleave_32f_aligned16.h> +#include <qa_8sc_deinterleave_real_16s_aligned16.h> +#include <qa_8sc_deinterleave_real_32f_aligned16.h> +#include <qa_8sc_deinterleave_real_8s_aligned16.h> +#include <qa_16s_convert_32f_aligned16.h> +#include <qa_16s_convert_32f_unaligned16.h> +#include <qa_16s_convert_8s_aligned16.h> +#include <qa_16s_convert_8s_unaligned16.h> +#include <qa_32f_convert_16s_aligned16.h> +#include <qa_32f_convert_16s_unaligned16.h> +#include <qa_32f_convert_32s_aligned16.h> +#include <qa_32f_convert_32s_unaligned16.h> +#include <qa_32f_convert_64f_aligned16.h> +#include <qa_32f_convert_64f_unaligned16.h> +#include <qa_32f_convert_8s_aligned16.h> +#include <qa_32f_convert_8s_unaligned16.h> +#include <qa_32s_convert_32f_aligned16.h> +#include <qa_32s_convert_32f_unaligned16.h> +#include <qa_64f_convert_32f_aligned16.h> +#include <qa_64f_convert_32f_unaligned16.h> +#include <qa_8s_convert_16s_aligned16.h> +#include <qa_8s_convert_16s_unaligned16.h> +#include <qa_8s_convert_32f_aligned16.h> +#include <qa_8s_convert_32f_unaligned16.h> +#include <qa_32fc_32f_power_32fc_aligned16.h> +#include <qa_32f_power_aligned16.h> +#include <qa_32fc_atan2_32f_aligned16.h> +#include <qa_32fc_power_spectral_density_32f_aligned16.h> 
+#include <qa_32fc_power_spectrum_32f_aligned16.h>
+#include <qa_32f_calc_spectral_noise_floor_aligned16.h>
+#include <qa_32f_accumulator_aligned16.h>
+#include <qa_32f_stddev_aligned16.h>
+#include <qa_32f_stddev_and_mean_aligned16.h>
+
+/*!
+ * \brief Build the top-level CppUnit suite named "volk".
+ *
+ * Allocates a new CppUnit::TestSuite and adds to it the result of the
+ * static suite() call of every qa_* class listed below, in the order
+ * written here.
+ *
+ * \returns pointer to the newly allocated suite.  This function never
+ * deletes it; the pointer is handed back to the caller.
+ */
+CppUnit::TestSuite *
+qa_volk::suite()
+{
+  CppUnit::TestSuite *s = new CppUnit::TestSuite("volk");
+
+  s->addTest(qa_16s_quad_max_star_aligned16::suite());
+  s->addTest(qa_32fc_dot_prod_aligned16::suite());
+  s->addTest(qa_32fc_square_dist_scalar_mult_aligned16::suite());
+  s->addTest(qa_32fc_square_dist_aligned16::suite());
+  s->addTest(qa_32f_sum_of_poly_aligned16::suite());
+  s->addTest(qa_32fc_index_max_aligned16::suite());
+  s->addTest(qa_32f_index_max_aligned16::suite());
+  s->addTest(qa_32fc_conjugate_dot_prod_aligned16::suite());
+  s->addTest(qa_16s_permute_and_scalar_add_aligned16::suite());
+  s->addTest(qa_16s_branch_4_state_8_aligned16::suite());
+  s->addTest(qa_16s_max_star_horizontal_aligned16::suite());
+  s->addTest(qa_16s_max_star_aligned16::suite());
+  s->addTest(qa_16s_add_quad_aligned16::suite());
+  s->addTest(qa_32f_add_aligned16::suite());
+  s->addTest(qa_32f_subtract_aligned16::suite());
+  s->addTest(qa_32f_max_aligned16::suite());
+  s->addTest(qa_32f_min_aligned16::suite());
+  s->addTest(qa_64f_max_aligned16::suite());
+  s->addTest(qa_64f_min_aligned16::suite());
+  s->addTest(qa_32s_and_aligned16::suite());
+  s->addTest(qa_32s_or_aligned16::suite());
+  s->addTest(qa_32f_dot_prod_aligned16::suite());
+  s->addTest(qa_32f_dot_prod_unaligned16::suite());
+  s->addTest(qa_32f_fm_detect_aligned16::suite());
+  s->addTest(qa_32fc_32f_multiply_aligned16::suite());
+  s->addTest(qa_32fc_multiply_aligned16::suite());
+  s->addTest(qa_32f_divide_aligned16::suite());
+  s->addTest(qa_32f_multiply_aligned16::suite());
+  s->addTest(qa_32f_sqrt_aligned16::suite());
+  s->addTest(qa_8sc_multiply_conjugate_16sc_aligned16::suite());
+  s->addTest(qa_8sc_multiply_conjugate_32fc_aligned16::suite());
+  s->addTest(qa_32u_popcnt_aligned16::suite());
+  s->addTest(qa_64u_popcnt_aligned16::suite());
+  s->addTest(qa_16u_byteswap_aligned16::suite());
+  s->addTest(qa_32u_byteswap_aligned16::suite());
+  s->addTest(qa_64u_byteswap_aligned16::suite());
+  s->addTest(qa_32f_normalize_aligned16::suite());
+  s->addTest(qa_16sc_deinterleave_16s_aligned16::suite());
+  s->addTest(qa_16sc_deinterleave_32f_aligned16::suite());
+  s->addTest(qa_16sc_deinterleave_real_16s_aligned16::suite());
+  s->addTest(qa_16sc_deinterleave_real_32f_aligned16::suite());
+  s->addTest(qa_16sc_deinterleave_real_8s_aligned16::suite());
+  s->addTest(qa_16sc_magnitude_16s_aligned16::suite());
+  s->addTest(qa_16sc_magnitude_32f_aligned16::suite());
+  s->addTest(qa_32fc_deinterleave_32f_aligned16::suite());
+  s->addTest(qa_32fc_deinterleave_64f_aligned16::suite());
+  s->addTest(qa_32fc_deinterleave_real_16s_aligned16::suite());
+  s->addTest(qa_32fc_deinterleave_real_32f_aligned16::suite());
+  s->addTest(qa_32fc_deinterleave_real_64f_aligned16::suite());
+  s->addTest(qa_32fc_magnitude_16s_aligned16::suite());
+  s->addTest(qa_32fc_magnitude_32f_aligned16::suite());
+  s->addTest(qa_32f_interleave_16sc_aligned16::suite());
+  s->addTest(qa_32f_interleave_32fc_aligned16::suite());
+  s->addTest(qa_8sc_deinterleave_16s_aligned16::suite());
+  s->addTest(qa_8sc_deinterleave_32f_aligned16::suite());
+  s->addTest(qa_8sc_deinterleave_real_16s_aligned16::suite());
+  s->addTest(qa_8sc_deinterleave_real_32f_aligned16::suite());
+  s->addTest(qa_8sc_deinterleave_real_8s_aligned16::suite());
+  s->addTest(qa_16s_convert_32f_aligned16::suite());
+  s->addTest(qa_16s_convert_32f_unaligned16::suite());
+  s->addTest(qa_16s_convert_8s_aligned16::suite());
+  s->addTest(qa_16s_convert_8s_unaligned16::suite());
+  s->addTest(qa_32f_convert_16s_aligned16::suite());
+  s->addTest(qa_32f_convert_16s_unaligned16::suite());
+  s->addTest(qa_32f_convert_32s_aligned16::suite());
+  s->addTest(qa_32f_convert_32s_unaligned16::suite());
+  s->addTest(qa_32f_convert_64f_aligned16::suite());
+  s->addTest(qa_32f_convert_64f_unaligned16::suite());
+  s->addTest(qa_32f_convert_8s_aligned16::suite());
+  s->addTest(qa_32f_convert_8s_unaligned16::suite());
+  s->addTest(qa_32s_convert_32f_aligned16::suite());
+  s->addTest(qa_32s_convert_32f_unaligned16::suite());
+  s->addTest(qa_64f_convert_32f_aligned16::suite());
+  s->addTest(qa_64f_convert_32f_unaligned16::suite());
+  s->addTest(qa_8s_convert_16s_aligned16::suite());
+  s->addTest(qa_8s_convert_16s_unaligned16::suite());
+  s->addTest(qa_8s_convert_32f_aligned16::suite());
+  s->addTest(qa_8s_convert_32f_unaligned16::suite());
+  s->addTest(qa_32fc_32f_power_32fc_aligned16::suite());
+  s->addTest(qa_32f_power_aligned16::suite());
+  s->addTest(qa_32fc_atan2_32f_aligned16::suite());
+  s->addTest(qa_32fc_power_spectral_density_32f_aligned16::suite());
+  s->addTest(qa_32fc_power_spectrum_32f_aligned16::suite());
+  s->addTest(qa_32f_calc_spectral_noise_floor_aligned16::suite());
+  s->addTest(qa_32f_accumulator_aligned16::suite());
+  s->addTest(qa_32f_stddev_aligned16::suite());
+  s->addTest(qa_32f_stddev_and_mean_aligned16::suite());
+
+  return s;
+}
diff --git a/volk/lib/qa_volk.h b/volk/lib/qa_volk.h
new file mode 100644
index 000000000..43fa7faba
--- /dev/null
+++ b/volk/lib/qa_volk.h
@@ -0,0 +1,36 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_QA_VOLK_H
+#define INCLUDED_QA_VOLK_H
+
+#include <cppunit/TestSuite.h>
+
+//! collects all the CppUnit tests of the volk library into one suite
+
+class qa_volk {
+ public:
+  //! return a CppUnit suite holding every volk QA test
+  static CppUnit::TestSuite *suite ();
+};
+
+#endif /* INCLUDED_QA_VOLK_H */
diff --git a/volk/lib/test_all.cc b/volk/lib/test_all.cc
new file mode 100644
index 000000000..50ac08eab
--- /dev/null
+++ b/volk/lib/test_all.cc
@@ -0,0 +1,82 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2002,2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <cppunit/ui/text/TestRunner.h>
+#include <cppunit/TextTestRunner.h>
+
+#include <qa_volk.h>
+
+#include <cppunit/XmlOutputter.h>
+#include <iostream>
+#include <getopt.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string>
+#include <fstream>
+
+/*
+ * Runs the whole volk QA suite.
+ *
+ * Options:
+ *   -o <file>   also route the results through a CppUnit::XmlOutputter
+ *               writing to <file>
+ *
+ * Exit status is 0 when the runner reports success and 1 otherwise
+ * (including when the -o file cannot be opened, in which case no test
+ * is run).  Bad command-line options exit with EXIT_FAILURE.
+ */
+int
+main (int argc, char **argv)
+{
+
+  int opt = 0;
+  std::string xmlOutputFile("");
+
+  while( (opt = getopt(argc, argv, "o:")) != -1){
+    switch(opt){
+    case 'o':
+      if(optarg){
+        xmlOutputFile.assign(optarg);
+      }
+      else{
+        std::cerr << "No xml file output specified for -o" << std::endl;
+        exit(EXIT_FAILURE);
+      }
+      break;
+
+    default: /* '?' */
+      fprintf(stderr, "Usage: %s [-o] \"xml output file\"\n",
+              argv[0]);
+      exit(EXIT_FAILURE);
+    }
+
+  }
+
+  CppUnit::TextUi::TestRunner runner;
+
+  runner.addTest (qa_volk::suite ());
+
+  bool was_successful = false;
+  if(!xmlOutputFile.empty()){
+    std::ofstream xmlOutput(xmlOutputFile.c_str());
+    if(xmlOutput.is_open()){
+      runner.setOutputter(new CppUnit::XmlOutputter(&runner.result(), xmlOutput));
+
+      was_successful = runner.run("", false, true, false);
+    }
+    else{
+      /* Previously this path returned failure without saying why. */
+      std::cerr << "Could not open xml output file " << xmlOutputFile << std::endl;
+    }
+    xmlOutput.close();
+  }
+  else{
+    was_successful = runner.run ("", false);
+  }
+
+  return was_successful ? 0 : 1;
+}
diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c
new file mode 100644
index 000000000..b1a93db26
--- /dev/null
+++ b/volk/lib/volk_rank_archs.c
@@ -0,0 +1,13 @@
+#include<volk_rank_archs.h>
+#include<stdio.h>
+
+/*
+ * Choose which entry of arch_defs to use for the given arch mask.
+ *
+ * arch_defs[0] is read as the number of entries; the entries themselves
+ * are arch_defs[1] .. arch_defs[arch_defs[0]].  Each entry and arch are
+ * treated as bitmasks: an entry is eligible only when every bit set in it
+ * is also set in arch.  Among eligible entries the one with the largest
+ * numeric value wins.
+ *
+ * Entry 0 (arch_defs[1]) is the starting choice and is never tested
+ * against arch, so it is what gets returned when nothing else is
+ * eligible -- presumably the generic implementation; confirm against
+ * the code that builds arch_defs.
+ *
+ * Returns the 0-based index of the chosen entry (i.e. arch_defs[ret + 1]).
+ */
+unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch) {
+  int i = 2;
+  unsigned int best_val = 0;
+  for(; i < arch_defs[0] + 1; ++i) {
+    /*
+     * Bitwise complement, not logical NOT: "!arch" is only ever 0 or 1,
+     * so for any non-zero arch the old test accepted every entry,
+     * including ones whose bits are missing from arch.
+     */
+    if(((unsigned int)arch_defs[i] & ~arch) == 0) {
+      if(arch_defs[i] > arch_defs[best_val + 1]) {
+        best_val = i - 1;
+      }
+    }
+  }
+  return best_val;
+}
diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h
new file mode 100644
index 000000000..26b9f7503
--- /dev/null
+++ b/volk/lib/volk_rank_archs.h
@@ -0,0 +1,14 @@
+#ifndef INCLUDED_VOLK_RANK_ARCHS_H
+#define INCLUDED_VOLK_RANK_ARCHS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Returns the 0-based index of the entry of arch_defs selected for the
+ * bitmask arch.  arch_defs[0] is the entry count; entries start at
+ * arch_defs[1].
+ */
+unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/ |