From 1633e9371a0bce876757f1c2c3e4054436b57950 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 18:48:56 -0800 Subject: cpuid: No more compile-time CPU checks. Compiles everything that gcc allows. Configure- and compile-time checks modified to only compile architectures that the compiler will handle. This means that volk will compile every arch that your gcc will compile, no matter if your CPU can execute the instructions or not. This lets you cross-compile volk. volk_rank_archs will be deferred to runtime. This has not been done yet. --- volk/include/volk/make_cpuid_c.py | 184 ++++++++++++++++++++++ volk/include/volk/make_cpuid_generic_c.py | 60 -------- volk/include/volk/make_cpuid_powerpc_c.py | 67 -------- volk/include/volk/make_cpuid_x86_c.py | 133 ---------------- volk/include/volk/make_mktables.py | 2 +- volk/include/volk/make_set_simd.py | 243 +++++++++--------------------- volk/include/volk/volk_register.py | 23 +-- 7 files changed, 257 insertions(+), 455 deletions(-) create mode 100644 volk/include/volk/make_cpuid_c.py delete mode 100644 volk/include/volk/make_cpuid_generic_c.py delete mode 100644 volk/include/volk/make_cpuid_powerpc_c.py delete mode 100644 volk/include/volk/make_cpuid_x86_c.py (limited to 'volk/include') diff --git a/volk/include/volk/make_cpuid_c.py b/volk/include/volk/make_cpuid_c.py new file mode 100644 index 000000000..20621769b --- /dev/null +++ b/volk/include/volk/make_cpuid_c.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. 
+# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + +from xml.dom import minidom + +HEADER_TEMPL = """\ +/*this file is auto_generated by volk_register.py*/ + +#include +#include + +struct VOLK_CPU volk_cpu; + +#if defined(__i386__) || (__x86_64__) +#include +#define cpuid_x86(op, r) __get_cpuid(op, r+0, r+1, r+2, r+3) + +static inline unsigned int cpuid_eax(unsigned int op) { + unsigned int regs[4]; + cpuid_x86 (op, regs); + return regs[0]; +} + +static inline unsigned int cpuid_ebx(unsigned int op) { + unsigned int regs[4]; + cpuid_x86 (op, regs); + return regs[1]; +} + +static inline unsigned int cpuid_ecx(unsigned int op) { + unsigned int regs[4]; + cpuid_x86 (op, regs); + return regs[2]; +} + +static inline unsigned int cpuid_edx(unsigned int op) { + unsigned int regs[4]; + cpuid_x86 (op, regs); + return regs[3]; +} +#endif + +""" + +def make_cpuid_c(dom) : + tempstring = HEADER_TEMPL; + + for domarch in dom: + if str(domarch.attributes["type"].value) == "x86": + if "no_test" in domarch.attributes.keys(): + no_test = str(domarch.attributes["no_test"].value); + if no_test == "true": + no_test = True; + else: + no_test = False; + else: + no_test = False; + arch = str(domarch.attributes["name"].value); + op = domarch.getElementsByTagName("op"); + if op: + op = str(op[0].firstChild.data); + reg = domarch.getElementsByTagName("reg"); + if reg: + reg = str(reg[0].firstChild.data); + shift = domarch.getElementsByTagName("shift"); + if shift: + shift = str(shift[0].firstChild.data); + val = domarch.getElementsByTagName("val"); + if val: + val = 
str(val[0].firstChild.data); + + if no_test: + tempstring = tempstring + """\ +int i_can_has_%s () { +#if defined(__i386__) || (__x86_64__) + return 1; +#else + return 0; +#endif +} + +""" % (arch) + + elif op == "1": + tempstring = tempstring + """\ +int i_can_has_%s () { +#if defined(__i386__) || (__x86_64__) + unsigned int e%sx = cpuid_e%sx (%s); + return ((e%sx >> %s) & 1) == %s; +#else + return 0; +#endif +} + +""" % (arch, reg, reg, op, reg, shift, val) + + elif op == "0x80000001": + tempstring = tempstring + """\ +int i_can_has_%s () { +#if defined(__i386__) || (__x86_64__) + unsigned int extended_fct_count = cpuid_eax(0x80000000); + if (extended_fct_count < 0x80000001) + return %s^1; + unsigned int extended_features = cpuid_e%sx (%s); + return ((extended_features >> %s) & 1) == %s; +#else + return 0; +#endif +} + +""" % (arch, val, reg, op, shift, val) + + elif str(domarch.attributes["type"].value) == "powerpc": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + """\ +int i_can_has_%s () { +#ifdef __PPC__ + return 1; +#else + return 0; +#endif +} + +""" % (arch) + + elif str(domarch.attributes["type"].value) == "all": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + """\ +int i_can_has_%s () { + return 1; +} + +""" % (arch) + else: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + """\ +int i_can_has_%s () { + return 0; +} + +""" % (arch) + + tempstring = tempstring + "void volk_cpu_init() {\n"; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" + tempstring = tempstring + "}\n\n" + + tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; + tempstring = tempstring + " unsigned int retval = 0;\n" + tempstring = tempstring + " volk_cpu_init();\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " retval += volk_cpu.has_" + 
arch + "() << LV_" + arch.swapcase() + ";\n" + tempstring = tempstring + " return retval;\n" + tempstring = tempstring + "}\n\n" + + return tempstring; + + + + + + + diff --git a/volk/include/volk/make_cpuid_generic_c.py b/volk/include/volk/make_cpuid_generic_c.py deleted file mode 100644 index c682d4138..000000000 --- a/volk/include/volk/make_cpuid_generic_c.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2011 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Radio; see the file COPYING. If not, write to -# the Free Software Foundation, Inc., 51 Franklin Street, -# Boston, MA 02110-1301, USA. 
-# - -from xml.dom import minidom - -def make_cpuid_generic_c(dom) : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n\n" - tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" - - for domarch in dom: - if str(domarch.attributes["type"].value) == "all": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 1;\n" - tempstring = tempstring + "}\n\n" - - else: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 0;\n" - tempstring = tempstring + "}\n\n" - - tempstring = tempstring + "void volk_cpu_init() {\n"; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" - tempstring = tempstring + "}\n\n" - - tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; - tempstring = tempstring + " unsigned int retval = 0;\n" - tempstring = tempstring + " volk_cpu_init();\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" - tempstring = tempstring + " return retval;\n" - tempstring = tempstring + "}\n\n" - - return tempstring; diff --git a/volk/include/volk/make_cpuid_powerpc_c.py b/volk/include/volk/make_cpuid_powerpc_c.py deleted file mode 100644 index 0b0ea84e7..000000000 --- a/volk/include/volk/make_cpuid_powerpc_c.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2011 Free Software Foundation, Inc. 
-# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Radio; see the file COPYING. If not, write to -# the Free Software Foundation, Inc., 51 Franklin Street, -# Boston, MA 02110-1301, USA. -# - -from xml.dom import minidom - -def make_cpuid_powerpc_c(dom) : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n\n" - tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" - - #just assume it has them for powerpc - for domarch in dom: - if str(domarch.attributes["type"].value) == "powerpc": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 1;\n" - tempstring = tempstring + "}\n\n" - elif str(domarch.attributes["type"].value) == "all": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 1;\n" - tempstring = tempstring + "}\n\n" - else: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 0;\n" - tempstring = tempstring + "}\n\n" - - - tempstring = tempstring + "void volk_cpu_init() {\n"; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " volk_cpu.has_" + 
arch + " = &i_can_has_" + arch + ";\n" - - tempstring = tempstring + "}\n\n" - tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; - tempstring = tempstring + " unsigned int retval = 0;\n" - tempstring = tempstring + " volk_cpu_init();\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" - tempstring = tempstring + " return retval;\n" - tempstring = tempstring + "}\n\n" - - return tempstring; - diff --git a/volk/include/volk/make_cpuid_x86_c.py b/volk/include/volk/make_cpuid_x86_c.py deleted file mode 100644 index 2b2bd7c91..000000000 --- a/volk/include/volk/make_cpuid_x86_c.py +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2011 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Radio; see the file COPYING. If not, write to -# the Free Software Foundation, Inc., 51 Franklin Street, -# Boston, MA 02110-1301, USA. 
-# - -from xml.dom import minidom - -def make_cpuid_x86_c(dom) : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n\n" - tempstring = tempstring + "#include \n\n" - tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" - - tempstring = tempstring + "#define cpuid_x86(op, r) __get_cpuid(op, r+0, r+1, r+2, r+3)\n\n" - tempstring = tempstring + "static inline unsigned int cpuid_eax(unsigned int op) {\n"; - tempstring = tempstring + " unsigned int regs[4];\n" - tempstring = tempstring + " cpuid_x86 (op, regs);\n" - tempstring = tempstring + " return regs[0];\n" - tempstring = tempstring + "}\n\n"; - - tempstring = tempstring + "static inline unsigned int cpuid_ebx(unsigned int op) {\n"; - tempstring = tempstring + " unsigned int regs[4];\n" - tempstring = tempstring + " cpuid_x86 (op, regs);\n" - tempstring = tempstring + " return regs[1];\n" - tempstring = tempstring + "}\n\n"; - - tempstring = tempstring + "static inline unsigned int cpuid_ecx(unsigned int op) {\n"; - tempstring = tempstring + " unsigned int regs[4];\n" - tempstring = tempstring + " cpuid_x86 (op, regs);\n" - tempstring = tempstring + " return regs[2];\n" - tempstring = tempstring + "}\n\n"; - - tempstring = tempstring + "static inline unsigned int cpuid_edx(unsigned int op) {\n"; - tempstring = tempstring + " unsigned int regs[4];\n" - tempstring = tempstring + " cpuid_x86 (op, regs);\n" - tempstring = tempstring + " return regs[3];\n" - tempstring = tempstring + "}\n\n"; - - for domarch in dom: - if str(domarch.attributes["type"].value) == "x86": - if "no_test" in domarch.attributes.keys(): - no_test = str(domarch.attributes["no_test"].value); - if no_test == "true": - no_test = True; - else: - no_test = False; - else: - no_test = False; - arch = str(domarch.attributes["name"].value); - op = domarch.getElementsByTagName("op"); - if op: - op = 
str(op[0].firstChild.data); - reg = domarch.getElementsByTagName("reg"); - if reg: - reg = str(reg[0].firstChild.data); - shift = domarch.getElementsByTagName("shift"); - if shift: - shift = str(shift[0].firstChild.data); - val = domarch.getElementsByTagName("val"); - if val: - val = str(val[0].firstChild.data); - - if no_test: - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 1;\n" - tempstring = tempstring + "}\n\n" - elif op == "1": - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " unsigned int e" + reg + "x = cpuid_e" + reg + "x (" + op + ");\n" - tempstring = tempstring + " return ((e" + reg + "x >> " + shift + ") & 1) == " + val + ";\n" - tempstring = tempstring + "}\n\n"; - - elif op == "0x80000001": - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " unsigned int extended_fct_count = cpuid_eax(0x80000000);\n"; - tempstring = tempstring + " if (extended_fct_count < 0x80000001)\n"; - tempstring = tempstring + " return "+ val + "^1;\n\n" - tempstring = tempstring + " unsigned int extended_features = cpuid_e" + reg + "x (" + op + ");\n"; - tempstring = tempstring + " return ((extended_features >> " + shift + ") & 1) == " + val + ";\n" - tempstring = tempstring + "}\n\n"; - elif str(domarch.attributes["type"].value) == "all": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 1;\n" - tempstring = tempstring + "}\n\n" - else: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + "int i_can_has_" + arch + " () {\n" - tempstring = tempstring + " return 0;\n" - tempstring = tempstring + "}\n\n" - - tempstring = tempstring + "void volk_cpu_init() {\n"; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" - 
tempstring = tempstring + "}\n\n" - - tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; - tempstring = tempstring + " unsigned int retval = 0;\n" - tempstring = tempstring + " volk_cpu_init();\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" - tempstring = tempstring + " return retval;\n" - tempstring = tempstring + "}\n\n" - - return tempstring; - - - - - - - diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py index 051ac268d..25da086fa 100644 --- a/volk/include/volk/make_mktables.py +++ b/volk/include/volk/make_mktables.py @@ -18,7 +18,7 @@ def make_mktables(funclist) : tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n'; for func in funclist: - tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", volk_rank_archs(' + func + '_arch_defs, volk_get_lvarch()));\n'; + tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", 0);\n'; tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n'; tempstring = tempstring + ' fclose(output);\n' tempstring = tempstring + '}\n'; diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index c74b0464d..8aef5bb16 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -15,53 +15,23 @@ # along with this program. If not, see . # +#TODO: add in checking for 32-bit and 64-bit compiler support, to pick +#which version to use. + +#for a given compiler: first try 64 bit, then fall back to 32 bit. +#to try 64 bit, just try compiling a program with -m64 +#if it passes, whee you're 64-bit +#if not, 32 +#this should probably be elsewhere and just referred in here +#actually, why the fuck does LF_CHECK_CC_FLAG([-m64]) work at all? 
it doesn't even COMPILE + from xml.dom import minidom def make_set_simd(dom) : tempstring = ""; tempstring = tempstring +'dnl this file is auto generated by volk_register.py\n\n'; - tempstring = tempstring + "AC_DEFUN([_MAKE_FAKE_PROCCPU],\n"; - tempstring = tempstring + "[\n"; - tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n"; - tempstring = tempstring + " AC_MSG_CHECKING([proccpu])\n"; - tempstring = tempstring + " case \"$MD_CPU\" in\n"; - tempstring = tempstring + " (x86)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ -I$srcdir/lib $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_x86.c 2>&1`\"\n"; - tempstring = tempstring + " then\n"; - tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; - tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; - tempstring = tempstring + " rm -f proccpu\n"; - tempstring = tempstring + " else\n"; - tempstring = tempstring + " AC_MSG_RESULT(no)\n"; - tempstring = tempstring + " lv_PROCCPU=no\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " ;;\n"; - tempstring = tempstring + " (powerpc)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_powerpc.c 2>&1`\"\n"; - tempstring = tempstring + " then\n"; - tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; - tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; - tempstring = tempstring + " rm -f proccpu\n"; - tempstring = tempstring + " else\n"; - tempstring = tempstring + " AC_MSG_RESULT(no)\n"; - tempstring = tempstring + " lv_PROCCPU=no\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " ;;\n"; - tempstring = tempstring + " (*)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_generic.c 2>&1`\"\n"; - tempstring = tempstring + " then\n"; - tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; - tempstring = tempstring + " 
lv_PROCCPU=\"`./proccpu`\"\n"; - tempstring = tempstring + " rm -f proccpu\n"; - tempstring = tempstring + " else\n"; - tempstring = tempstring + " AC_MSG_RESULT(no)\n"; - tempstring = tempstring + " lv_PROCCPU=no\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " ;;\n"; - tempstring = tempstring + " esac\n"; - tempstring = tempstring + "])\n" - + tempstring = tempstring +'\ndnl define arch checks\n'; for domarch in dom: if str(domarch.attributes["type"].value) != "all": arch = str(domarch.attributes["name"].value); @@ -73,9 +43,10 @@ def make_set_simd(dom) : tempstring = tempstring + " LF_CHECK_CXX_FLAG([-" + flag + "])\n"; tempstring = tempstring + "])\n"; + tempstring = tempstring +'\ndnl main set_simd_flags\n'; tempstring = tempstring + "AC_DEFUN([LV_SET_SIMD_FLAGS],\n"; tempstring = tempstring + "[\n"; - tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n"; + #tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n"; tempstring = tempstring + " AC_SUBST(LV_CXXFLAGS)\n"; tempstring = tempstring + " indCC=no\n"; tempstring = tempstring + " indCXX=no\n"; @@ -96,29 +67,33 @@ def make_set_simd(dom) : tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; tempstring = tempstring + " ADDONS=\"\"\n"; tempstring = tempstring + " BUILT_ARCHS=\"\"\n"; - tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; + #tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; tempstring = tempstring + " OVERRULE_FLAG=\"no\"\n"; tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; - tempstring = tempstring + " cf_with_lv_arch=$lv_PROCCPU\n"; + tempstring = tempstring + " cf_with_lv_arch=\""; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + arch + " "; + tempstring = tempstring[0:-1] + "\"\n"; tempstring = tempstring + " OVERRULE_FLAG=\"yes\"\n"; - tempstring = tempstring + " fi\n"; + + tempstring = tempstring +'\ndnl init LV_HAVE_XXX and then try to 
add archs\n'; for domarch in dom: if str(domarch.attributes["type"].value) != "all": arch = str(domarch.attributes["name"].value); tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=no\n"; - tempstring = tempstring + " case \"$MD_CPU\" in\n"; - tempstring = tempstring + " (x86)\n" for domarch in dom: arch = str(domarch.attributes["name"].value); atype = str(domarch.attributes["type"].value); - if atype == "x86": - tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n"; + if atype != "all": + tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n"; for domarch in dom: arch = str(domarch.attributes["name"].value); atype = str(domarch.attributes["type"].value); + tempstring = tempstring +'\ndnl add in flags for arch ' + arch + '\n'; overrule = domarch.getElementsByTagName("overrule"); if overrule: overrule = str(overrule[0].firstChild.data); @@ -131,139 +106,57 @@ def make_set_simd(dom) : overrule_val = ""; flag = domarch.getElementsByTagName("flag"); flag = str(flag[0].firstChild.data); - if atype == "x86": - tempstring = tempstring + " for i in $lf_CXXFLAGS\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; - tempstring = tempstring + " indCXX=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " for i in $lf_CFLAGS\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; - tempstring = tempstring + " indCC=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + 
overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " indLV_ARCH=no\n" + if atype != "all": + tempstring = tempstring + " for i in $lf_CXXFLAGS\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; + tempstring = tempstring + " indCXX=yes\n"; tempstring = tempstring + " fi\n" - - tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " indCC=no\n" - tempstring = tempstring + " indCXX=no\n" - tempstring = tempstring + " indLV_ARCH=no\n" - elif atype == "all": - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " for i in $lf_CFLAGS\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; + tempstring = tempstring + " indCC=yes\n"; tempstring = tempstring + " fi\n" - tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " 
AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " done\n" + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " fi\n" - tempstring = tempstring + " ;;\n" - - tempstring = tempstring + " (powerpc)\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - atype = str(domarch.attributes["type"].value); - if atype == "powerpc": - tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n"; - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - atype = str(domarch.attributes["type"].value); - flag = domarch.getElementsByTagName("flag"); - flag = str(flag[0].firstChild.data); - if atype == "powerpc": - tempstring = tempstring + " for i in $lf_CXXFLAGS\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; - tempstring = tempstring + " indCXX=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " for i in $lf_CFLAGS\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; - tempstring = tempstring + " indCC=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = 
tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test \"$indCC\" = yes && test \"indCXX\" = yes && \"indLV_ARCH\" = yes; then\n" - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " indCC=no\n" - tempstring = tempstring + " indCXX=no\n" - tempstring = tempstring + " indLV_ARCH=no\n" - elif atype == "all": - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " indLV_ARCH=no\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " indLV_ARCH=no\n" - tempstring = tempstring + " ;;\n" - tempstring = tempstring + " (*)\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - atype = 
str(domarch.attributes["type"].value); - flag = domarch.getElementsByTagName("flag"); - flag = str(flag[0].firstChild.data); - if atype == "all": - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " indLV_ARCH=no\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " indCC=no\n" + tempstring = tempstring + " indCXX=no\n" + tempstring = tempstring + " indLV_ARCH=no\n" + else: + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " 
if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " indLV_ARCH=no\n" - tempstring = tempstring + " ;;\n" - tempstring = tempstring + " esac\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " indLV_ARCH=no\n" + + tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" tempstring = tempstring + "])\n" diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index bc8f959af..242a8dee7 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -5,12 +5,9 @@ import re import string from xml.dom import minidom from volk_regexp import * -from make_cpuid_x86_c import make_cpuid_x86_c +from make_cpuid_c import make_cpuid_c from make_cpuid_h import make_cpuid_h -from make_proccpu_sim import make_proccpu_sim from make_set_simd import make_set_simd -from make_cpuid_generic_c import make_cpuid_generic_c -from make_cpuid_powerpc_c import make_cpuid_powerpc_c from make_registry import make_registry from make_h import make_h from make_init_h import make_init_h @@ -35,10 +32,7 @@ outfile_typedefs = open("volk_typedefs.h", "w"); outfile_init_h = open("../../lib/volk_init.h", "w"); outfile_init_c = open("../../lib/volk_init.c", "w"); outfile_cpu_h = open("volk_cpu.h", "w"); -outfile_cpu_x86_c = open("../../lib/volk_cpu_x86.c", "w"); -outfile_cpu_generic_c = open("../../lib/volk_cpu_generic.c", "w"); -outfile_cpu_powerpc_c = open("../../lib/volk_cpu_powerpc.c", "w"); -outfile_proccpu_sim = 
open("../../lib/volk_proccpu_sim.c", "w"); +outfile_cpu_c = open("../../lib/volk_cpu.c", "w"); outfile_config_in = open("../../volk_config.h.in", "w"); outfile_config_fixed = open("volk_config_fixed.h", "w"); outfile_mktables = open("../../lib/volk_mktables.c", "w"); @@ -226,21 +220,12 @@ outfile_mktables.close(); outfile_cpu_h.write(make_cpuid_h(filearchs)); outfile_cpu_h.close(); -outfile_cpu_x86_c.write(make_cpuid_x86_c(filearchs)); -outfile_cpu_x86_c.close(); - -outfile_proccpu_sim.write(make_proccpu_sim(filearchs)); -outfile_proccpu_sim.close(); +outfile_cpu_c.write(make_cpuid_c(filearchs)); +outfile_cpu_c.close(); outfile_set_simd.write(make_set_simd(filearchs)); outfile_set_simd.close(); -outfile_cpu_generic_c.write(make_cpuid_generic_c(filearchs)); -outfile_cpu_generic_c.close(); - -outfile_cpu_powerpc_c.write(make_cpuid_powerpc_c(filearchs)); -outfile_cpu_powerpc_c.close(); - outfile_config_in.write(make_config_in(filearchs)); outfile_config_in.close(); -- cgit From 2387618a492c73ec029599027daeaeebe937e7a7 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 18:58:42 -0800 Subject: comments --- volk/include/volk/make_set_simd.py | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 8aef5bb16..08b0f0937 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -15,16 +15,6 @@ # along with this program. If not, see . # -#TODO: add in checking for 32-bit and 64-bit compiler support, to pick -#which version to use. - -#for a given compiler: first try 64 bit, then fall back to 32 bit. -#to try 64 bit, just try compiling a program with -m64 -#if it passes, whee you're 64-bit -#if not, 32 -#this should probably be elsewhere and just referred in here -#actually, why the fuck does LF_CHECK_CC_FLAG([-m64]) work at all? 
it doesn't even COMPILE - from xml.dom import minidom def make_set_simd(dom) : -- cgit From d364316fa2e047890af09c7bdeed776d6391b3db Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 21:28:43 -0800 Subject: Removed some mktables stuff since it's passe --- volk/include/volk/make_mktables.py | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 volk/include/volk/make_mktables.py (limited to 'volk/include') diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py deleted file mode 100644 index 25da086fa..000000000 --- a/volk/include/volk/make_mktables.py +++ /dev/null @@ -1,33 +0,0 @@ - - -def make_mktables(funclist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; - - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstrgin = tempstring + '#include\n'; - tempstring = tempstring + "\n\n"; - - tempstring = tempstring + 'int main() {\n'; - tempstring = tempstring + ' int i = 0;\n'; - tempstring = tempstring + ' FILE* output;\n'; - tempstring = tempstring + ' output = fopen("volk_tables.h", "w");\n'; - tempstring = tempstring + ' fprintf(output, "#ifndef INCLUDED_VOLK_TABLES_H\\n");\n'; - tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n'; - - for func in funclist: - tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", 0);\n'; - tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n'; - tempstring = tempstring + ' fclose(output);\n' - tempstring = tempstring + '}\n'; - return tempstring; - - - - - - - - -- cgit From fd03c0ee7cb9af71a5e0292569b626ba3dabd885 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 21:59:19 -0800 Subject: Moved the fn indices gen from volk.c to volk_registry.h so the qa code has access to the static stuff --- 
volk/include/volk/make_c.py | 18 ------------------ volk/include/volk/make_registry.py | 23 ++++++++++++++++++++++- volk/include/volk/volk_register.py | 2 +- 3 files changed, 23 insertions(+), 20 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index 6e75067d0..f845ce7d4 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -43,24 +43,6 @@ def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : lindex = tempstring.rfind(","); tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); tempstring = tempstring + "};\n\n"; - - tempstring = tempstring + "static const char* " + funclist[i] + "_indices[] = {\n"; - - tags_counter = 0; - for arch_list in fcountlist[i]: - tempstring = tempstring + "#if LV_HAVE_" - for ind in range(len(arch_list)): - - tempstring = tempstring + arch_list[ind]; - if ind < len(arch_list) - 1: - tempstring = tempstring + " && LV_HAVE_"; - - tempstring = tempstring + "\n \"" + str(taglist[i][tags_counter]) + "\",\n#endif\n"; - tags_counter = tags_counter + 1; - - lindex = tempstring.rfind(","); - tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); - tempstring = tempstring + "};\n\n"; tempstring = tempstring + retlist[i] + "inline " + funclist[i] + "_manual" + arched_arglist[i] + '\n'; tempstring = tempstring + "return " + funclist[i] + "_archs[volk_get_index(" + funclist[i] + "_indices, arch, " + funclist[i] + "_arch_defs)](" + my_arglist[i] + ");" + "\n}\n"; diff --git a/volk/include/volk/make_registry.py b/volk/include/volk/make_registry.py index 8457d61f3..2bc8bda28 100644 --- a/volk/include/volk/make_registry.py +++ b/volk/include/volk/make_registry.py @@ -2,7 +2,7 @@ from xml.dom import minidom from emit_omnilog import * import string -def make_registry(dom, funclist, fcountlist) : +def make_registry(dom, funclist, fcountlist, taglist) : tempstring 
= ""; tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_REGISTRY_H'; @@ -24,6 +24,27 @@ def make_registry(dom, funclist, fcountlist) : tempstring = tempstring +"#endif /*LV_HAVE_" + arch.swapcase() + "*/\n\n"; counter = 0; + + for i in range(len(funclist)): + tempstring = tempstring + "static const char* " + funclist[i] + "_indices[] = {\n"; + + tags_counter = 0; + for arch_list in fcountlist[i]: + tempstring = tempstring + "#if LV_HAVE_" + for ind in range(len(arch_list)): + + tempstring = tempstring + arch_list[ind]; + if ind < len(arch_list) - 1: + tempstring = tempstring + " && LV_HAVE_"; + + tempstring = tempstring + "\n \"" + str(taglist[i][tags_counter]) + "\",\n#endif\n"; + tags_counter = tags_counter + 1; + + lindex = tempstring.rfind(","); + tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); + tempstring = tempstring + "};\n\n"; + + for fcount in fcountlist: tempstring = tempstring + "static const int " + funclist[counter] + "_arch_defs[] = {\n"; counter = counter + 1; diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 242a8dee7..77290f12a 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -229,7 +229,7 @@ outfile_set_simd.close(); outfile_config_in.write(make_config_in(filearchs)); outfile_config_in.close(); -outfile_reg.write(make_registry(filearchs, functions, fcountlist)); +outfile_reg.write(make_registry(filearchs, functions, fcountlist, taglist)); outfile_reg.close(); outfile_h.write(make_h(functions, arched_arglist, retlist)); -- cgit From 88c389e1166cf5427cfabc012502337999c2b68f Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 22:25:54 -0800 Subject: Revert "Removed some mktables stuff since it's passe" This reverts commit d364316fa2e047890af09c7bdeed776d6391b3db. 
--- volk/include/volk/make_mktables.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 volk/include/volk/make_mktables.py (limited to 'volk/include') diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py new file mode 100644 index 000000000..25da086fa --- /dev/null +++ b/volk/include/volk/make_mktables.py @@ -0,0 +1,33 @@ + + +def make_mktables(funclist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; + + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstrgin = tempstring + '#include\n'; + tempstring = tempstring + "\n\n"; + + tempstring = tempstring + 'int main() {\n'; + tempstring = tempstring + ' int i = 0;\n'; + tempstring = tempstring + ' FILE* output;\n'; + tempstring = tempstring + ' output = fopen("volk_tables.h", "w");\n'; + tempstring = tempstring + ' fprintf(output, "#ifndef INCLUDED_VOLK_TABLES_H\\n");\n'; + tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n'; + + for func in funclist: + tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", 0);\n'; + tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n'; + tempstring = tempstring + ' fclose(output);\n' + tempstring = tempstring + '}\n'; + return tempstring; + + + + + + + + -- cgit From 1bf5a3ea4bbb9fd6baf72eb876778d2652c97f2c Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 22:28:38 -0800 Subject: reverted mktables deletion until i get my act together and make it go straight from python -> .h --- volk/include/volk/volk_register.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 77290f12a..460a11fab 100755 --- 
a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -18,9 +18,9 @@ from make_runtime_c import make_runtime_c from make_init_c import make_init_c from make_runtime import make_runtime from make_typedefs import make_typedefs -from make_mktables import make_mktables from make_environment_init_c import make_environment_init_c from make_environment_init_h import make_environment_init_h +from make_mktables import make_mktables outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); outfile_reg = open("volk_registry.h", "w"); @@ -213,9 +213,6 @@ for func in functions: fcountlist.append(fcount); taglist.append(tags); -outfile_mktables.write(make_mktables(functions)); -outfile_mktables.close(); - outfile_cpu_h.write(make_cpuid_h(filearchs)); outfile_cpu_h.close(); @@ -261,3 +258,6 @@ outfile_environment_c.close(); outfile_environment_h.write(make_environment_init_h()); outfile_environment_h.close(); + +outfile_mktables.write(make_mktables(functions)); +outfile_mktables.close(); -- cgit From 5d10579e79b35a3d80af967b73dfe72f7ce14740 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 15 Feb 2011 23:37:12 -0800 Subject: volk: temporarily removed avx arch. when -mavx is used, gcc will promote SSE instructions to their AVX equivalents. while this is a benefit on AVX architectures, it makes it really difficult to selectively enable code based on architecture at runtime. GCC suggests compiling different files with different flags and loading them at runtime. GCC can take a flying leap. 
--- volk/include/volk/archs.xml | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index a19a5add9..70d55a8be 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -126,14 +126,4 @@ msse4.2 - - - 1 - 1 - c - 28 - mavx - - - -- cgit From 0eeeb636a89c5086293bae31511316e4200ad2f9 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 16 Feb 2011 17:20:09 -0800 Subject: Volk_runtime now does self-initialization. You can call volk_xxx_a16() just like in volk.c. The first run of each function does the rank_archs call. Subsequent calls proceed with no overhead. volk_init is still being generated but not used at all. --- volk/include/volk/make_runtime.py | 10 ++-------- volk/include/volk/make_runtime_c.py | 9 +++++---- volk/include/volk/volk_register.py | 4 ++-- 3 files changed, 9 insertions(+), 14 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_runtime.py b/volk/include/volk/make_runtime.py index 645b3aaee..91e703b24 100644 --- a/volk/include/volk/make_runtime.py +++ b/volk/include/volk/make_runtime.py @@ -4,7 +4,7 @@ from volk_regexp import * -def make_runtime(funclist) : +def make_runtime(funclist, arglist) : tempstring = ""; tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; @@ -18,14 +18,8 @@ def make_runtime(funclist) : tempstring = tempstring + '\n'; - tempstring = tempstring + "struct VOLK_RUNTIME {\n"; - for i in range(len(funclist)): - tempstring = tempstring + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + ";\n"; - tempstring = tempstring + "};\n\n"; - - tempstring = tempstring + "struct VOLK_RUNTIME* get_volk_runtime();\n\n" - tempstring = tempstring + "\nvoid volk_runtime_init();\n"; + tempstring = tempstring + "extern void (*" + funclist[i] + ")(" + arglist[i] + ");\n" tempstring = tempstring + emit_epilog(); tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; 
diff --git a/volk/include/volk/make_runtime_c.py b/volk/include/volk/make_runtime_c.py index 070df9ba7..0519dddf8 100644 --- a/volk/include/volk/make_runtime_c.py +++ b/volk/include/volk/make_runtime_c.py @@ -3,7 +3,7 @@ import string from volk_regexp import * -def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : +def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist, my_argtypelist) : tempstring = ""; tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; @@ -18,8 +18,6 @@ def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcoun for func in funclist: tempstring = tempstring + "#include\n" ; tempstring = tempstring + '\n'; - - tempstring = tempstring + "struct VOLK_RUNTIME volk_runtime;\n"; for i in range(len(funclist)): tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n"; @@ -42,6 +40,9 @@ def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcoun tempstring = tempstring + retlist[i] + "default_acquire_" + funclist[i] + replace_arch.sub("", arched_arglist[i]) + '\n'; - tempstring = tempstring + "volk_runtime." 
+ funclist[i] + " = " + funclist[i] + "_archs[volk_rank_archs(" + funclist[i] + "_arch_defs, volk_get_lvarch())];\n" + "return " + funclist[i] + "_archs[volk_rank_archs(" + funclist[i] + "_arch_defs, volk_get_lvarch())](" + my_arglist[i] + ");" + '\n}\n'; + tempstring = tempstring + " %s = %s_archs[volk_rank_archs(%s_arch_defs, volk_get_lvarch())];\n" % (funclist[i], funclist[i], funclist[i]) + tempstring = tempstring + " %s(%s);\n}\n\n" % (funclist[i], my_arglist[i]) + + tempstring = tempstring + "%s(*%s)(%s) = &default_acquire_%s;\n\n" % (retlist[i], funclist[i], my_argtypelist[i], funclist[i]) return tempstring; diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 460a11fab..b719042df 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -241,13 +241,13 @@ outfile_config_fixed.close(); outfile_c.write( make_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist)); outfile_c.close(); -outfile_runtime_c.write(make_runtime_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist)); +outfile_runtime_c.write(make_runtime_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist, my_argtypelist)); outfile_runtime_c.close(); outfile_init_c.write(make_init_c(functions, filearchs)); outfile_init_c.close(); -outfile_runtime.write(make_runtime(functions)); +outfile_runtime.write(make_runtime(functions, my_argtypelist)); outfile_runtime.close(); outfile_typedefs.write(make_typedefs(functions, retlist, my_argtypelist)); -- cgit From 26c81b79e28f3d4d9a5eb5b5a56338371ff118ce Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 16 Feb 2011 17:30:16 -0800 Subject: Fixed mktables for the old non-runtime volk. 
--- volk/include/volk/Makefile.am | 3 ++- volk/include/volk/make_mktables.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 7a5edd624..23822b229 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -133,7 +133,8 @@ volkinclude_HEADERS = \ VOLK_MKTABLES_SOURCES = \ $(platform_CODE) \ $(top_srcdir)/lib/volk_rank_archs.c \ - $(top_srcdir)/lib/volk_mktables.c + $(top_srcdir)/lib/volk_mktables.c \ + $(top_srcdir)/lib/volk_cpu.c volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py index 25da086fa..051ac268d 100644 --- a/volk/include/volk/make_mktables.py +++ b/volk/include/volk/make_mktables.py @@ -18,7 +18,7 @@ def make_mktables(funclist) : tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n'; for func in funclist: - tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", 0);\n'; + tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", volk_rank_archs(' + func + '_arch_defs, volk_get_lvarch()));\n'; tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n'; tempstring = tempstring + ' fclose(output);\n' tempstring = tempstring + '}\n'; -- cgit From cef9e33e01e946d82564f517de501cafcb5b1f32 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 9 Mar 2011 17:30:02 -0800 Subject: Volk: first steps to conditional compilation/multiple obj files. 
--- volk/include/volk/Makefile.am | 6 ------ volk/include/volk/archs.xml | 8 ++++++++ volk/include/volk/make_c.py | 6 +++--- volk/include/volk/make_environment_init_c.py | 5 ++--- volk/include/volk/make_h.py | 1 - volk/include/volk/make_init_c.py | 4 ++-- volk/include/volk/make_registry.py | 15 +++++++-------- volk/include/volk/make_runtime.py | 1 - volk/include/volk/make_runtime_c.py | 7 +++---- volk/include/volk/make_set_simd.py | 18 ++++++++++-------- volk/include/volk/volk_16i_branch_4_state_8_a16.h | 4 ++-- volk/include/volk/volk_16i_convert_8i_a16.h | 2 +- volk/include/volk/volk_16i_convert_8i_u.h | 2 +- volk/include/volk/volk_16i_max_star_16i_a16.h | 4 ++-- .../volk/volk_16i_max_star_horizontal_16i_a16.h | 4 ++-- .../include/volk/volk_16i_permute_and_scalar_add_a16.h | 4 ++-- volk/include/volk/volk_16i_s32f_convert_32f_a16.h | 6 +++--- volk/include/volk/volk_16i_s32f_convert_32f_u.h | 6 +++--- volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h | 4 ++-- volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h | 4 ++-- volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h | 8 ++++---- .../include/volk/volk_16ic_deinterleave_real_16i_a16.h | 6 +++--- volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h | 6 +++--- volk/include/volk/volk_16ic_magnitude_16i_a16.h | 8 ++++---- .../volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h | 6 +++--- .../volk/volk_16ic_s32f_deinterleave_real_32f_a16.h | 6 +++--- volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h | 8 ++++---- volk/include/volk/volk_16u_byteswap_a16.h | 6 +++--- volk/include/volk/volk_32f_accumulator_s32f_a16.h | 4 ++-- volk/include/volk/volk_32f_convert_64f_a16.h | 2 +- volk/include/volk/volk_32f_convert_64f_u.h | 2 +- volk/include/volk/volk_32f_index_max_16u_a16.h | 6 +++--- .../include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h | 4 ++-- .../volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_16i_a16.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_16i_u.h | 4 
++-- volk/include/volk/volk_32f_s32f_convert_32i_a16.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_32i_u.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_8i_a16.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_8i_u.h | 4 ++-- volk/include/volk/volk_32f_s32f_normalize_a16.h | 6 +++--- volk/include/volk/volk_32f_s32f_power_32f_a16.h | 14 +++++++------- volk/include/volk/volk_32f_s32f_stddev_32f_a16.h | 6 +++--- volk/include/volk/volk_32f_sqrt_32f_a16.h | 6 +++--- .../include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h | 6 +++--- volk/include/volk/volk_32f_x2_add_32f_a16.h | 6 +++--- volk/include/volk/volk_32f_x2_divide_32f_a16.h | 6 +++--- volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h | 8 ++++---- volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 8 ++++---- volk/include/volk/volk_32f_x2_interleave_32fc_a16.h | 4 ++-- volk/include/volk/volk_32f_x2_max_32f_a16.h | 6 +++--- volk/include/volk/volk_32f_x2_min_32f_a16.h | 6 +++--- volk/include/volk/volk_32f_x2_multiply_32f_a16.h | 6 +++--- .../volk/volk_32f_x2_s32f_interleave_16ic_a16.h | 6 +++--- volk/include/volk/volk_32f_x2_subtract_32f_a16.h | 6 +++--- volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h | 4 ++-- volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h | 6 +++--- volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h | 4 ++-- volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h | 4 ++-- .../include/volk/volk_32fc_deinterleave_real_32f_a16.h | 4 ++-- .../include/volk/volk_32fc_deinterleave_real_64f_a16.h | 4 ++-- volk/include/volk/volk_32fc_index_max_16u_a16.h | 4 ++-- volk/include/volk/volk_32fc_magnitude_32f_a16.h | 8 ++++---- volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h | 14 +++++++------- .../volk/volk_32fc_s32f_deinterleave_real_16i_a16.h | 4 ++-- volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h | 8 ++++---- volk/include/volk/volk_32fc_s32f_power_32fc_a16.h | 8 ++++---- .../volk/volk_32fc_s32f_power_spectrum_32f_a16.h | 8 ++++---- .../volk_32fc_s32f_x2_power_spectral_density_32f_a16.h | 
8 ++++---- .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h | 6 +++--- .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 4 ++-- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 10 +++++----- volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h | 6 +++--- ...volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h | 4 ++-- volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h | 4 ++-- volk/include/volk/volk_32i_s32f_convert_32f_a16.h | 4 ++-- volk/include/volk/volk_32i_s32f_convert_32f_u.h | 4 ++-- volk/include/volk/volk_32i_x2_and_32i_a16.h | 6 +++--- volk/include/volk/volk_32i_x2_or_32i_a16.h | 6 +++--- volk/include/volk/volk_32u_byteswap_a16.h | 4 ++-- volk/include/volk/volk_32u_popcnt_a16.h | 4 ++-- volk/include/volk/volk_64f_convert_32f_a16.h | 2 +- volk/include/volk/volk_64f_convert_32f_u.h | 2 +- volk/include/volk/volk_64f_x2_max_64f_a16.h | 4 ++-- volk/include/volk/volk_64f_x2_min_64f_a16.h | 4 ++-- volk/include/volk/volk_64u_byteswap_a16.h | 4 ++-- volk/include/volk/volk_64u_popcnt_a16.h | 4 ++-- volk/include/volk/volk_8i_convert_16i_a16.h | 6 +++--- volk/include/volk/volk_8i_convert_16i_u.h | 4 ++-- volk/include/volk/volk_8i_s32f_convert_32f_a16.h | 6 +++--- volk/include/volk/volk_8i_s32f_convert_32f_u.h | 4 ++-- volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h | 4 ++-- volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h | 4 ++-- volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h | 4 ++-- .../volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h | 6 +++--- .../volk/volk_8ic_s32f_deinterleave_real_32f_a16.h | 6 +++--- .../volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h | 4 ++-- .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h | 4 ++-- volk/include/volk/volk_common.h | 2 +- volk/include/volk/volk_register.py | 11 ++++++++--- 100 files changed, 275 insertions(+), 271 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 23822b229..caf707138 100644 --- a/volk/include/volk/Makefile.am +++ 
b/volk/include/volk/Makefile.am @@ -26,7 +26,6 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ volkincludedir = $(prefix)/include/volk BUILT_SOURCES: \ - volk_config.h \ volk_tables.h volkinclude_HEADERS = \ @@ -34,7 +33,6 @@ volkinclude_HEADERS = \ volk_common.h \ volk_config_fixed.h \ volk_runtime.h \ - volk_config.h \ volk_tables.h \ volk_typedefs.h \ volk_registry.h \ @@ -143,12 +141,8 @@ volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) volk_tables.h: volk_mktables$(EXEEXT) ./volk_mktables$(EXEEXT) -volk_config.h: $(top_builddir)/volk_config.h - cp $^ $(top_builddir)/include/volk/$@ - distclean-local: rm -f volk_config_fixed.h - rm -f volk_config.h rm -f volk_cpu.h rm -f volk.h rm -f volk_registry.h diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index 70d55a8be..2547df590 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -126,4 +126,12 @@ msse4.2 + + 1 + 1 + c + 28 + mavx + + diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index f845ce7d4..89bf9ea1a 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -30,12 +30,12 @@ def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : tags_counter = 0; for arch_list in fcountlist[i]: - tempstring = tempstring + "#if LV_HAVE_" + tempstring = tempstring + "#if defined(LV_HAVE_" for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind]; + tempstring = tempstring + arch_list[ind] + ")"; if ind < len(arch_list) - 1: - tempstring = tempstring + " && LV_HAVE_"; + tempstring = tempstring + " && defined(LV_HAVE_"; tempstring = tempstring + "\n " + funclist[i] + "_" + str(taglist[i][tags_counter]) + ",\n#endif\n"; tags_counter = tags_counter + 1; diff --git a/volk/include/volk/make_environment_init_c.py b/volk/include/volk/make_environment_init_c.py index e06c7f246..263d5bcd1 100644 --- a/volk/include/volk/make_environment_init_c.py +++ 
b/volk/include/volk/make_environment_init_c.py @@ -4,13 +4,12 @@ def make_environment_init_c(dom) : tempstring = ""; tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring + "#include\n" - tempstring = tempstring + "#include\n" for domarch in dom: arch = str(domarch.attributes["name"].value); incs = domarch.getElementsByTagName("include"); for inc in incs: my_inc = str(inc.firstChild.data); - tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; tempstring = tempstring + "#include<" + my_inc + ">\n"; tempstring = tempstring + "#endif\n" tempstring = tempstring + '\n\n'; @@ -21,7 +20,7 @@ def make_environment_init_c(dom) : envs = domarch.getElementsByTagName("environment"); for env in envs: cmd = str(env.firstChild.data); - tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; tempstring = tempstring + " " + cmd + "\n"; tempstring = tempstring + "#endif\n" diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py index 81d9ad401..6aea441b7 100644 --- a/volk/include/volk/make_h.py +++ b/volk/include/volk/make_h.py @@ -11,7 +11,6 @@ def make_h(funclist, arched_arglist, retlist) : tempstring = tempstring + '\n#define INCLUDED_VOLK_H'; tempstring = tempstring + '\n\n#include\n'; tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n' tempstring = tempstring + emit_prolog() diff --git a/volk/include/volk/make_init_c.py b/volk/include/volk/make_init_c.py index 330e19592..0a7010cd6 100644 --- a/volk/include/volk/make_init_c.py +++ b/volk/include/volk/make_init_c.py @@ -12,7 +12,7 @@ def make_init_c(funclist, dom) : incs = domarch.getElementsByTagName("include"); for inc in incs: my_inc = str(inc.firstChild.data); - tempstring = 
tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; tempstring = tempstring + "#include<" + my_inc + ">\n"; tempstring = tempstring + "#endif\n" tempstring = tempstring + '\n\n'; @@ -32,7 +32,7 @@ def make_init_c(funclist, dom) : for env in envs: cmd = str(env.firstChild.data); tempstring = tempstring + " if(volk_cpu.has_" + arch + "()){\n"; - tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; tempstring = tempstring + " " + cmd + "\n"; tempstring = tempstring + "#endif\n" tempstring = tempstring + " }\n"; diff --git a/volk/include/volk/make_registry.py b/volk/include/volk/make_registry.py index 2bc8bda28..7fbe9a8b0 100644 --- a/volk/include/volk/make_registry.py +++ b/volk/include/volk/make_registry.py @@ -7,7 +7,6 @@ def make_registry(dom, funclist, fcountlist, taglist) : tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_REGISTRY_H'; tempstring = tempstring +'\n#define INCLUDED_VOLK_REGISTRY_H\n\n'; - tempstring = tempstring +'#include\n'; tempstring = tempstring +'#include\n'; tempstring = tempstring + emit_prolog(); tempstring = tempstring + '\n' @@ -17,7 +16,7 @@ def make_registry(dom, funclist, fcountlist, taglist) : for domarch in dom: arch = str(domarch.attributes["name"].value); - tempstring = tempstring +"#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring +"#ifdef LV_HAVE_" + arch.swapcase() + "\n"; tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 1\n"; tempstring = tempstring +"#else\n"; tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 0\n"; @@ -30,12 +29,12 @@ def make_registry(dom, funclist, fcountlist, taglist) : tags_counter = 0; for arch_list in fcountlist[i]: - tempstring = tempstring + "#if LV_HAVE_" + tempstring = tempstring + "#if defined(LV_HAVE_" for ind 
in range(len(arch_list)): - tempstring = tempstring + arch_list[ind]; + tempstring = tempstring + arch_list[ind] + ")"; if ind < len(arch_list) - 1: - tempstring = tempstring + " && LV_HAVE_"; + tempstring = tempstring + " && defined(LV_HAVE_"; tempstring = tempstring + "\n \"" + str(taglist[i][tags_counter]) + "\",\n#endif\n"; tags_counter = tags_counter + 1; @@ -59,11 +58,11 @@ def make_registry(dom, funclist, fcountlist, taglist) : tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], " + ", ""); tempstring = tempstring + ",\n" for arch_list in fcount: - tempstring = tempstring + "#if LV_HAVE_" + tempstring = tempstring + "#if defined(LV_HAVE_" for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind]; + tempstring = tempstring + arch_list[ind] + ")"; if ind < len(arch_list) - 1: - tempstring = tempstring + " && LV_HAVE_"; + tempstring = tempstring + " && defined(LV_HAVE_"; tempstring = tempstring + "\n" tempstring = tempstring + " (1 << LV_" for ind in range(len(arch_list)): diff --git a/volk/include/volk/make_runtime.py b/volk/include/volk/make_runtime.py index 91e703b24..d468487d7 100644 --- a/volk/include/volk/make_runtime.py +++ b/volk/include/volk/make_runtime.py @@ -11,7 +11,6 @@ def make_runtime(funclist, arglist) : tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_RUNTIME'; tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME'; tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; tempstring = tempstring + emit_prolog(); diff --git a/volk/include/volk/make_runtime_c.py b/volk/include/volk/make_runtime_c.py index 0519dddf8..99cdf395f 100644 --- a/volk/include/volk/make_runtime_c.py +++ b/volk/include/volk/make_runtime_c.py @@ -9,7 +9,6 @@ def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcoun tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring 
+ '#include\n'; tempstring = tempstring + "#include\n"; tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; @@ -24,12 +23,12 @@ def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcoun tags_counter = 0; for arch_list in fcountlist[i]: - tempstring = tempstring + "#if LV_HAVE_" + tempstring = tempstring + "#if defined(LV_HAVE_" for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind]; + tempstring = tempstring + arch_list[ind] + ")"; if ind < len(arch_list) - 1: - tempstring = tempstring + " && LV_HAVE_"; + tempstring = tempstring + " && defined(LV_HAVE_"; tempstring = tempstring + "\n " + funclist[i] + "_" + str(taglist[i][tags_counter]) + ",\n#endif\n"; tags_counter = tags_counter + 1; diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 08b0f0937..da631d217 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -54,7 +54,7 @@ def make_set_simd(dom) : tempstring = tempstring + " [cf_with_lv_arch=\"\"])\n"; if str(domarch.attributes["type"].value) == "all": arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; + tempstring = tempstring + " AC_DEFINE(LV_MAKE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; tempstring = tempstring + " ADDONS=\"\"\n"; tempstring = tempstring + " BUILT_ARCHS=\"\"\n"; #tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; @@ -68,11 +68,11 @@ def make_set_simd(dom) : tempstring = tempstring + " OVERRULE_FLAG=\"yes\"\n"; tempstring = tempstring + " fi\n"; - tempstring = tempstring +'\ndnl init LV_HAVE_XXX and then try to add archs\n'; + tempstring = tempstring +'\ndnl init LV_MAKE_XXX and then try to add archs\n'; for domarch in dom: if str(domarch.attributes["type"].value) != "all": arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + 
"=no\n"; + tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=no\n"; for domarch in dom: arch = str(domarch.attributes["name"].value); @@ -121,10 +121,9 @@ def make_set_simd(dom) : tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; + #tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indCC=no\n" tempstring = tempstring + " indCXX=no\n" @@ -140,13 +139,16 @@ def make_set_simd(dom) : tempstring = tempstring + " indLV_ARCH=no\n" tempstring = tempstring + " fi\n" tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indLV_ARCH=no\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " AM_CONDITIONAL(LV_MAKE_" + arch.swapcase() + ", test \"$LV_MAKE_" + arch.swapcase() + "\" == \"yes\")\n"; + tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" tempstring = tempstring + "])\n" diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a16.h b/volk/include/volk/volk_16i_branch_4_state_8_a16.h index 3437c1a6b..5eb03b346 100644 --- 
a/volk/include/volk/volk_16i_branch_4_state_8_a16.h +++ b/volk/include/volk/volk_16i_branch_4_state_8_a16.h @@ -8,7 +8,7 @@ -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include #include @@ -137,7 +137,7 @@ static inline void volk_16i_branch_4_state_8_a16_ssse3(short* target, short* s #endif /*LV_HAVE_SSEs*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_16i_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { int i = 0; diff --git a/volk/include/volk/volk_16i_convert_8i_a16.h b/volk/include/volk/volk_16i_convert_8i_a16.h index 73e45ad63..4d51e5903 100644 --- a/volk/include/volk/volk_16i_convert_8i_a16.h +++ b/volk/include/volk/volk_16i_convert_8i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Converts the input 16 bit integer data into 8 bit integer data diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/include/volk/volk_16i_convert_8i_u.h index 5fc792b56..df1084fe0 100644 --- a/volk/include/volk/volk_16i_convert_8i_u.h +++ b/volk/include/volk/volk_16i_convert_8i_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! 
\brief Converts the input 16 bit integer data into 8 bit integer data diff --git a/volk/include/volk/volk_16i_max_star_16i_a16.h b/volk/include/volk/volk_16i_max_star_16i_a16.h index ff57bd2a1..063444279 100644 --- a/volk/include/volk/volk_16i_max_star_16i_a16.h +++ b/volk/include/volk/volk_16i_max_star_16i_a16.h @@ -6,7 +6,7 @@ #include -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include #include @@ -85,7 +85,7 @@ static inline void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, #endif /*LV_HAVE_SSSE3*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, unsigned int num_bytes) { diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h index 695e08dbf..ece6adb40 100644 --- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h +++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h @@ -6,7 +6,7 @@ #include -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include #include @@ -109,7 +109,7 @@ static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, #endif /*LV_HAVE_SSSE3*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { int i = 0; diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h index e52a949fb..ae1a18157 100644 --- a/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h +++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h @@ -8,7 +8,7 @@ -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include #include @@ -116,7 +116,7 @@ static inline void volk_16i_permute_and_scalar_add_a16_sse2(short* target, sho #endif /*LV_HAVE_SSEs*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* 
cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { int i = 0; diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a16.h b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h index 83fd26ff9..09bc252f0 100644 --- a/volk/include/volk/volk_16i_s32f_convert_32f_a16.h +++ b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! @@ -58,7 +58,7 @@ static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, con } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! @@ -94,7 +94,7 @@ static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 16 bit input data buffer diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/include/volk/volk_16i_s32f_convert_32f_u.h index 8f0dd0083..d34acc091 100644 --- a/volk/include/volk/volk_16i_s32f_convert_32f_u.h +++ b/volk/include/volk/volk_16i_s32f_convert_32f_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! @@ -59,7 +59,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! @@ -96,7 +96,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const in } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 16 bit input data buffer diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h index e4ec5ab4e..94e5eb986 100644 --- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h +++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h @@ -9,7 +9,7 @@ -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include @@ -167,7 +167,7 @@ static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* #endif /*LV_HAVE_SSE2*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { int i = 0; diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h index 5744ca3a6..c157bf64a 100644 --- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h +++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h @@ -9,7 +9,7 @@ -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include #include @@ -111,7 +111,7 @@ static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* #endif /*LV_HAVE_SSE2*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h index 7e08bf182..227a92303 100644 --- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h +++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include /*! 
\brief Deinterleaves the complex 16 bit vector into I & Q vector data @@ -52,7 +52,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int } #endif /* LV_HAVE_SSSE3 */ -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Deinterleaves the complex 16 bit vector into I & Q vector data @@ -120,7 +120,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int1 } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 16 bit vector into I & Q vector data \param complexVector The complex input vector @@ -140,7 +140,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, i } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Deinterleaves the complex 16 bit vector into I & Q vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h index 388c00592..35d0e8be2 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include /*! \brief Deinterleaves the complex 16 bit vector into I vector data @@ -47,7 +47,7 @@ static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, c #endif /* LV_HAVE_SSSE3 */ -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Deinterleaves the complex 16 bit vector into I vector data @@ -96,7 +96,7 @@ static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, co } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Deinterleaves the complex 16 bit vector into I vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h index 55a25702e..bdf5fc162 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include /*! \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data @@ -59,7 +59,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con } #endif /* LV_HAVE_SSSE3 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data \param complexVector The complex input vector @@ -77,7 +77,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, c } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a16.h b/volk/include/volk/volk_16ic_magnitude_16i_a16.h index bdcace750..e75d54ec4 100644 --- a/volk/include/volk/volk_16ic_magnitude_16i_a16.h +++ b/volk/include/volk/volk_16ic_magnitude_16i_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector @@ -76,7 +76,7 @@ static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, co } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! 
\brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector @@ -153,7 +153,7 @@ static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, con } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values @@ -173,7 +173,7 @@ static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC_DISABLED +#ifdef LV_HAVE_ORC_DISABLED /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h index 606de2fc5..dcb2499fa 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data @@ -68,7 +68,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, fl } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data \param complexVector The complex input vector @@ -89,7 +89,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! 
\brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h index 62331e496..f21fe77f8 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Deinterleaves the complex 16 bit vector into I float vector data @@ -52,7 +52,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffe } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Deinterleaves the complex 16 bit vector into I float vector data @@ -99,7 +99,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 16 bit vector into I float vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h index ae64efbeb..388d2ebcd 100644 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector @@ -70,7 +70,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! 
\brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector @@ -140,7 +140,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values @@ -161,7 +161,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVect } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC_DISABLED +#ifdef LV_HAVE_ORC_DISABLED /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values diff --git a/volk/include/volk/volk_16u_byteswap_a16.h b/volk/include/volk/volk_16u_byteswap_a16.h index c8128dbab..f393c05c5 100644 --- a/volk/include/volk/volk_16u_byteswap_a16.h +++ b/volk/include/volk/volk_16u_byteswap_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! @@ -43,7 +43,7 @@ static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Byteswaps (in-place) an aligned vector of int16_t's. \param intsToSwap The vector of data to byte swap @@ -61,7 +61,7 @@ static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Byteswaps (in-place) an aligned vector of int16_t's. 
\param intsToSwap The vector of data to byte swap diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a16.h b/volk/include/volk/volk_32f_accumulator_s32f_a16.h index 4a3588e6d..6a85e066e 100644 --- a/volk/include/volk/volk_32f_accumulator_s32f_a16.h +++ b/volk/include/volk/volk_32f_accumulator_s32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Accumulates the values in the input buffer @@ -42,7 +42,7 @@ static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Accumulates the values in the input buffer \param result The accumulated result diff --git a/volk/include/volk/volk_32f_convert_64f_a16.h b/volk/include/volk/volk_32f_convert_64f_a16.h index c303dc118..8ca83220b 100644 --- a/volk/include/volk/volk_32f_convert_64f_a16.h +++ b/volk/include/volk/volk_32f_convert_64f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Converts the float values into double values diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h index a825767de..387baa3b9 100644 --- a/volk/include/volk/volk_32f_convert_64f_u.h +++ b/volk/include/volk/volk_32f_convert_64f_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! 
\brief Converts the float values into double values diff --git a/volk/include/volk/volk_32f_index_max_16u_a16.h b/volk/include/volk/volk_32f_index_max_16u_a16.h index d070e17d5..3934d2db7 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a16.h +++ b/volk/include/volk/volk_32f_index_max_16u_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { @@ -63,7 +63,7 @@ static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const #endif /*LV_HAVE_SSE4_1*/ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const float* src0, unsigned int num_points) { @@ -122,7 +122,7 @@ static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const fl #endif /*LV_HAVE_SSE*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, const float* src0, unsigned int num_points) { if(num_points > 0){ float max = src0[0]; diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h index ff4d5b19c..6efd21a37 100644 --- a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. @@ -78,7 +78,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. \param outputVector The byte-aligned vector where the results will be stored. 
diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h index 168245d65..55d4e0319 100644 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Calculates the spectral noise floor of an input power spectrum @@ -116,7 +116,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* no } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the spectral noise floor of an input power spectrum diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h index d6b16e336..9d1d0ef4d 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value @@ -44,7 +44,7 @@ static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, con } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h index 4d306e53c..06228ef7d 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! 
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value @@ -45,7 +45,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h index ae874fd7b..82c74bf44 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value @@ -40,7 +40,7 @@ static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, con } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h index 561fcd800..253a48ae3 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value @@ -41,7 +41,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! 
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h index f64f2a213..8dab0cdf4 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value @@ -51,7 +51,7 @@ static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h index 420693571..72b193c9d 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value @@ -52,7 +52,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value diff --git a/volk/include/volk/volk_32f_s32f_normalize_a16.h b/volk/include/volk/volk_32f_s32f_normalize_a16.h index 0850cddf7..e6195cd32 100644 --- a/volk/include/volk/volk_32f_s32f_normalize_a16.h +++ b/volk/include/volk/volk_32f_s32f_normalize_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! 
\brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) @@ -41,7 +41,7 @@ static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Normalizes the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -60,7 +60,7 @@ static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const f } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Normalizes the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a16.h b/volk/include/volk/volk_32f_s32f_power_32f_a16.h index 3ed594d9a..ecff901e1 100644 --- a/volk/include/volk/volk_32f_s32f_power_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_power_32f_a16.h @@ -5,10 +5,10 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -26,7 +26,7 @@ static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const floa float* cPtr = cVector; const float* aPtr = aVector; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 vPower = _mm_set_ps1(power); __m128 zeroValue = _mm_setzero_ps(); __m128 signMask; @@ -62,10 +62,10 @@ static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const floa } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -83,7 +83,7 @@ static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* float* cPtr = cVector; const float* aPtr = aVector; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 vPower = _mm_set_ps1(power); 
__m128 zeroValue = _mm_setzero_ps(); __m128 signMask; @@ -119,7 +119,7 @@ static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Takes each the input vector value to the specified power and stores the results in the return vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h index 32f4fa067..48d2fe1fe 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Calculates the standard deviation of the input buffer using the supplied mean @@ -65,7 +65,7 @@ static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const floa } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Calculates the standard deviation of the input buffer using the supplied mean @@ -111,7 +111,7 @@ static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the standard deviation of the input buffer using the supplied mean \param stddev The calculated standard deviation diff --git a/volk/include/volk/volk_32f_sqrt_32f_a16.h b/volk/include/volk/volk_32f_sqrt_32f_a16.h index 513c2cffe..a9ce76f88 100644 --- a/volk/include/volk/volk_32f_sqrt_32f_a16.h +++ b/volk/include/volk/volk_32f_sqrt_32f_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Sqrts the two input vectors and store their results in the third vector @@ -40,7 +40,7 @@ static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVecto } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Sqrts the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -58,7 +58,7 @@ static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aV } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC extern void volk_32f_sqrt_32f_a16_orc_impl(float *, const float*, unsigned int); /*! \brief Sqrts the two input vectors and store their results in the third vector diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h index 278089841..f1cb2ae0e 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Calculates the standard deviation and mean of the input buffer @@ -78,7 +78,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, flo } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Calculates the standard deviation and mean of the input buffer @@ -134,7 +134,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the standard deviation and mean of the input buffer \param stddev The calculated standard deviation diff --git a/volk/include/volk/volk_32f_x2_add_32f_a16.h b/volk/include/volk/volk_32f_x2_add_32f_a16.h index d0d0e0a0e..2de6a6644 100644 --- a/volk/include/volk/volk_32f_x2_add_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_add_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! 
\brief Adds the two input vectors and store their results in the third vector @@ -43,7 +43,7 @@ static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVec } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Adds the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -63,7 +63,7 @@ static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Adds the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a16.h b/volk/include/volk/volk_32f_x2_divide_32f_a16.h index d844e25b0..1603e78de 100644 --- a/volk/include/volk/volk_32f_x2_divide_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_divide_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Divides the two input vectors and store their results in the third vector @@ -43,7 +43,7 @@ static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* a } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Divides the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -63,7 +63,7 @@ static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const floa } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! 
\brief Divides the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h index 61aa56815..d13f12e51 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h @@ -4,7 +4,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { @@ -24,7 +24,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const fl #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { @@ -73,7 +73,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float #endif /*LV_HAVE_SSE*/ -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include @@ -120,7 +120,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index 8469a3cea..7c1136a67 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -4,7 +4,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) { @@ -24,7 +24,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) 
{ @@ -73,7 +73,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* #endif /*LV_HAVE_SSE*/ -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include @@ -120,7 +120,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h index 29c9392df..f3731fa2a 100644 --- a/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h +++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Interleaves the I & Q vector data into the complex vector @@ -48,7 +48,7 @@ static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Interleaves the I & Q vector data into the complex vector. \param iBuffer The I buffer data to be interleaved diff --git a/volk/include/volk/volk_32f_x2_max_32f_a16.h b/volk/include/volk/volk_32f_x2_max_32f_a16.h index 26e7f1246..60be6e36d 100644 --- a/volk/include/volk/volk_32f_x2_max_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_max_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector @@ -45,7 +45,7 @@ static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVec } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector \param cVector The vector where the results will be stored @@ -67,7 +67,7 @@ static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_x2_min_32f_a16.h b/volk/include/volk/volk_32f_x2_min_32f_a16.h index 23bae044c..3b8291531 100644 --- a/volk/include/volk/volk_32f_x2_min_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_min_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector @@ -45,7 +45,7 @@ static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVec } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector \param cVector The vector where the results will be stored @@ -67,7 +67,7 @@ static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h index a0dcfa86e..cef17f5a6 100644 --- a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! 
\brief Multiplys the two input vectors and store their results in the third vector @@ -43,7 +43,7 @@ static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Multiplys the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -63,7 +63,7 @@ static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const fl } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Multiplys the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h index 30306774d..e98735245 100644 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. @@ -62,7 +62,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexV } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. @@ -127,7 +127,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVe } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. 
\param iBuffer The I buffer data to be interleaved diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a16.h b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h index 7404bfe79..c01f2c1f3 100644 --- a/volk/include/volk/volk_32f_x2_subtract_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Subtracts bVector form aVector and store their results in the cVector @@ -43,7 +43,7 @@ static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Subtracts bVector form aVector and store their results in the cVector \param cVector The vector where the results will be stored @@ -63,7 +63,7 @@ static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const fl } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Subtracts bVector form aVector and store their results in the cVector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h index af9e39537..6e446cbef 100644 --- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h +++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h @@ -9,7 +9,7 @@ #define MAX(X,Y) ((X) > (Y)?(X):(Y)) #endif -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include #include @@ -98,7 +98,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* sr #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h index 514998800..846315a4a 100644 --- 
a/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h +++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Multiplies the input complex vector with the input float vector and store their results in the third vector @@ -56,7 +56,7 @@ static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector \param cVector The vector where the results will be stored @@ -76,7 +76,7 @@ static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, c } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h index 84d2576ed..3e7c3fa28 100644 --- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Deinterleaves the complex vector into I & Q vector data @@ -49,7 +49,7 @@ static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Deinterleaves the complex vector into I & Q vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h index 34262a7af..945a26742 100644 --- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h +++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data @@ -51,7 +51,7 @@ static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, doubl } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h index 9838ec88b..3c3fb2583 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Deinterleaves the complex vector into I vector data @@ -44,7 +44,7 @@ static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex vector into I vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h index af392d074..40c1a7a46 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h +++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! 
\brief Deinterleaves the complex vector into I vector data @@ -42,7 +42,7 @@ static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, con } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex vector into I vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_32fc_index_max_16u_a16.h b/volk/include/volk/volk_32fc_index_max_16u_a16.h index 532ae4e7c..0ad1edbe9 100644 --- a/volk/include/volk/volk_32fc_index_max_16u_a16.h +++ b/volk/include/volk/volk_32fc_index_max_16u_a16.h @@ -6,7 +6,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include #include @@ -188,7 +188,7 @@ static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32f #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { float sq_dist = 0.0; float max = 0.0; diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a16.h b/volk/include/volk/volk_32fc_magnitude_32f_a16.h index be7216dce..946190e41 100644 --- a/volk/include/volk/volk_32fc_magnitude_32f_a16.h +++ b/volk/include/volk/volk_32fc_magnitude_32f_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector @@ -48,7 +48,7 @@ static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, cons } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector @@ -96,7 +96,7 @@ static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values @@ -115,7 +115,7 @@ static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, c } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h index e9f74438d..55b1b6c70 100644 --- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h +++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h @@ -5,10 +5,10 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -27,7 +27,7 @@ static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, con const unsigned int quarterPoints = num_points / 4; const float invNormalizeFactor = 1.0 / normalizeFactor; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 testVector = _mm_set_ps1(2*M_PI); __m128 correctVector = _mm_set_ps1(M_PI); __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); @@ -67,10 +67,10 @@ static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, con #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -89,7 +89,7 @@ static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const const unsigned int quarterPoints = num_points / 4; const float invNormalizeFactor = 1.0 / normalizeFactor; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 testVector = _mm_set_ps1(2*M_PI); __m128 correctVector = _mm_set_ps1(M_PI); __m128 vNormalizeFactor = 
_mm_set_ps1(invNormalizeFactor); @@ -131,7 +131,7 @@ static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief performs the atan2 on the input vector and stores the results in the output vector. \param outputVector The vector where the results will be stored. diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h index 31465bff9..1e3e61e08 100644 --- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h +++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data @@ -54,7 +54,7 @@ static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h index 530359600..14318ab01 100644 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include /*! \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector @@ -60,7 +60,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVecto } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! 
\brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector @@ -120,7 +120,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values @@ -140,7 +140,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVe } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h index 3507fdb3c..90cc4f5e7 100644 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h @@ -4,10 +4,10 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -25,7 +25,7 @@ static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const l lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 vPower = _mm_set_ps1(power); __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; @@ -81,7 +81,7 @@ static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const l } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Takes each the input complex vector value to the specified power and stores the results in the return vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h index 39d8f7aa2..03da069c2 100644 --- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h +++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h @@ -5,10 +5,10 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -24,7 +24,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOut float* destPtr = logPowerOutput; uint64_t number = 0; const float iNormalizationFactor = 1.0 / normalizationFactor; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 magScalar = _mm_set_ps1(10.0); magScalar = _mm_div_ps(magScalar, logf4(magScalar)); @@ -88,7 +88,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOut } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Calculates the log10 power value for each input point \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h index 0120b5307..5bcd7f7c4 100644 --- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h +++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h @@ -5,10 +5,10 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ @@ -27,7 +27,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* const float iRBW = 1.0 / rbw; const float iNormalizationFactor = 1.0 / normalizationFactor; -#if LV_HAVE_LIB_SIMDMATH +#ifdef LV_HAVE_LIB_SIMDMATH __m128 magScalar = _mm_set_ps1(10.0); magScalar = _mm_div_ps(magScalar, logf4(magScalar)); @@ -94,7 +94,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Calculates the log10 power value divided by the RBW for each input point \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h index a01971df3..2e5036f69 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h @@ -5,7 +5,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -59,7 +59,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* r #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE && LV_HAVE_64 +#if defined(LV_HAVE_SSE) && defined(LV_HAVE_64) static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -202,7 +202,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* resul } #endif -#if LV_HAVE_SSE && LV_HAVE_32 +#if defined(LV_HAVE_SSE) && defined(LV_HAVE_32) static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h index 2fa5918cc..69781f0fb 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -5,7 +5,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const 
lv_32fc_t* taps, unsigned int num_bytes) { @@ -57,7 +57,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* res #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include #include diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h index 9a7b65ab4..9bfb88934 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h @@ -6,7 +6,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -56,7 +56,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, con #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE && LV_HAVE_64 +#if defined(LV_HAVE_SSE) && defined(LV_HAVE_64) static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -192,7 +192,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, cons #endif -#if LV_HAVE_SSE && LV_HAVE_32 +#if defined(LV_HAVE_SSE) && defined(LV_HAVE_32) static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -316,7 +316,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, cons #endif /*LV_HAVE_SSE*/ -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include @@ -373,7 +373,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h index b4214f5d2..72010b855 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h +++ 
b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h @@ -6,7 +6,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include /*! \brief Multiplies the two input complex vectors and stores their results in the third vector @@ -53,7 +53,7 @@ static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Multiplies the two input complex vectors and stores their results in the third vector \param cVector The vector where the results will be stored @@ -73,7 +73,7 @@ static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, co } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Multiplies the two input complex vectors and stores their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h index 6a863b16d..910f51679 100644 --- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h +++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h @@ -6,7 +6,7 @@ #include #include -#if LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include #include @@ -105,7 +105,7 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h index 406097fc8..551f3cb53 100644 --- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h +++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h @@ -5,7 +5,7 @@ #include #include -#if 
LV_HAVE_SSE3 +#ifdef LV_HAVE_SSE3 #include #include @@ -91,7 +91,7 @@ static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_ #endif /*LV_HAVE_SSE3*/ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a16.h b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h index 0fcadd9cb..b744c7197 100644 --- a/volk/include/volk/volk_32i_s32f_convert_32f_a16.h +++ b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! @@ -47,7 +47,7 @@ static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 32 bit input data buffer diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h index 1dd6422f8..d8afd218c 100644 --- a/volk/include/volk/volk_32i_s32f_convert_32f_u.h +++ b/volk/include/volk/volk_32i_s32f_convert_32f_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! @@ -48,7 +48,7 @@ static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const i #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 32 bit input data buffer diff --git a/volk/include/volk/volk_32i_x2_and_32i_a16.h b/volk/include/volk/volk_32i_x2_and_32i_a16.h index 3baa1d856..4d50efd32 100644 --- a/volk/include/volk/volk_32i_x2_and_32i_a16.h +++ b/volk/include/volk/volk_32i_x2_and_32i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Ands the two input vectors and store their results in the third vector @@ -43,7 +43,7 @@ static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Ands the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -63,7 +63,7 @@ static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32 } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Ands the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32i_x2_or_32i_a16.h b/volk/include/volk/volk_32i_x2_or_32i_a16.h index 0be22f00a..9edbdbafd 100644 --- a/volk/include/volk/volk_32i_x2_or_32i_a16.h +++ b/volk/include/volk/volk_32i_x2_or_32i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Ors the two input vectors and store their results in the third vector @@ -43,7 +43,7 @@ static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* a } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Ors the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored @@ -63,7 +63,7 @@ static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_ } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Ors the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_32u_byteswap_a16.h b/volk/include/volk/volk_32u_byteswap_a16.h index 7556ec7b1..dc5cedab9 100644 --- a/volk/include/volk/volk_32u_byteswap_a16.h +++ b/volk/include/volk/volk_32u_byteswap_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! @@ -51,7 +51,7 @@ static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Byteswaps (in-place) an aligned vector of int32_t's. \param intsToSwap The vector of data to byte swap diff --git a/volk/include/volk/volk_32u_popcnt_a16.h b/volk/include/volk/volk_32u_popcnt_a16.h index f6e25e4e8..0d8b48fd5 100644 --- a/volk/include/volk/volk_32u_popcnt_a16.h +++ b/volk/include/volk/volk_32u_popcnt_a16.h @@ -5,7 +5,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t value) { @@ -23,7 +23,7 @@ static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t val #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE4_2 +#ifdef LV_HAVE_SSE4_2 #include diff --git a/volk/include/volk/volk_64f_convert_32f_a16.h b/volk/include/volk/volk_64f_convert_32f_a16.h index 7dca065f0..cfcdbdc3a 100644 --- a/volk/include/volk/volk_64f_convert_32f_a16.h +++ b/volk/include/volk/volk_64f_convert_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! 
\brief Converts the double values into float values diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h index 6338c1433..5c323230a 100644 --- a/volk/include/volk/volk_64f_convert_32f_u.h +++ b/volk/include/volk/volk_64f_convert_32f_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Converts the double values into float values diff --git a/volk/include/volk/volk_64f_x2_max_64f_a16.h b/volk/include/volk/volk_64f_x2_max_64f_a16.h index 4b0c1f5f1..21f488bf7 100644 --- a/volk/include/volk/volk_64f_x2_max_64f_a16.h +++ b/volk/include/volk/volk_64f_x2_max_64f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector @@ -45,7 +45,7 @@ static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* a } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_64f_x2_min_64f_a16.h b/volk/include/volk/volk_64f_x2_min_64f_a16.h index aa961e384..8711a0eae 100644 --- a/volk/include/volk/volk_64f_x2_min_64f_a16.h +++ b/volk/include/volk/volk_64f_x2_min_64f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector @@ -45,7 +45,7 @@ static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* a } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector \param cVector The vector where the results will be stored diff --git a/volk/include/volk/volk_64u_byteswap_a16.h b/volk/include/volk/volk_64u_byteswap_a16.h index 0eefe0138..b4bed8451 100644 --- a/volk/include/volk/volk_64u_byteswap_a16.h +++ b/volk/include/volk/volk_64u_byteswap_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE2 +#ifdef LV_HAVE_SSE2 #include /*! @@ -59,7 +59,7 @@ static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int } #endif /* LV_HAVE_SSE2 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Byteswaps (in-place) an aligned vector of int64_t's. \param intsToSwap The vector of data to byte swap diff --git a/volk/include/volk/volk_64u_popcnt_a16.h b/volk/include/volk/volk_64u_popcnt_a16.h index 59511dc29..b416d052e 100644 --- a/volk/include/volk/volk_64u_popcnt_a16.h +++ b/volk/include/volk/volk_64u_popcnt_a16.h @@ -5,7 +5,7 @@ #include -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t value) { @@ -36,7 +36,7 @@ static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t val #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE4_2 && LV_HAVE_64 +#if defined(LV_HAVE_SSE4_2) && defined(LV_HAVE_64) #include diff --git a/volk/include/volk/volk_8i_convert_16i_a16.h b/volk/include/volk/volk_8i_convert_16i_a16.h index 3d7045753..260ac40a1 100644 --- a/volk/include/volk/volk_8i_convert_16i_a16.h +++ b/volk/include/volk/volk_8i_convert_16i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! @@ -47,7 +47,7 @@ static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const i } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Converts the input 8 bit integer data into 16 bit integer data \param inputVector The 8 bit input data buffer @@ -65,7 +65,7 @@ static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! \brief Converts the input 8 bit integer data into 16 bit integer data \param inputVector The 8 bit input data buffer diff --git a/volk/include/volk/volk_8i_convert_16i_u.h b/volk/include/volk/volk_8i_convert_16i_u.h index bcff13406..7d7104f52 100644 --- a/volk/include/volk/volk_8i_convert_16i_u.h +++ b/volk/include/volk/volk_8i_convert_16i_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! @@ -48,7 +48,7 @@ static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 8 bit integer data into 16 bit integer data \param inputVector The 8 bit input data buffer diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h index 99a24ec10..9991b150e 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h +++ b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! @@ -66,7 +66,7 @@ static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, cons } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 8 bit input data buffer @@ -86,7 +86,7 @@ static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, con } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#ifdef LV_HAVE_ORC /*! 
\brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 8 bit input data buffer diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h index 1e30957e8..3cd6bb67c 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_u.h +++ b/volk/include/volk/volk_8i_s32f_convert_32f_u.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! @@ -67,7 +67,7 @@ static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value \param inputVector The 8 bit input data buffer diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h index 91c9b2c58..249acab49 100644 --- a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h +++ b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data @@ -51,7 +51,7 @@ static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h index bf3dc20dd..7b64b37c5 100644 --- a/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h +++ b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data @@ -42,7 +42,7 @@ static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, c #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h index 13de79423..a1abad487 100644 --- a/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h +++ b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSSE3 +#ifdef LV_HAVE_SSSE3 #include /*! \brief Deinterleaves the complex 8 bit vector into I vector data @@ -43,7 +43,7 @@ static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, cons } #endif /* LV_HAVE_SSSE3 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 8 bit vector into I vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h index 22c3ebb23..80032d2fe 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! 
\brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data @@ -74,7 +74,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data @@ -136,7 +136,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, flo } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h index 5f1430394..47a968ac1 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h @@ -4,7 +4,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Deinterleaves the complex 8 bit vector into I float vector data @@ -61,7 +61,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_SSE +#ifdef LV_HAVE_SSE #include /*! \brief Deinterleaves the complex 8 bit vector into I float vector data @@ -107,7 +107,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, c } #endif /* LV_HAVE_SSE */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Deinterleaves the complex 8 bit vector into I float vector data \param complexVector The complex input vector diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h index d9cacbf46..014f662a3 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector @@ -68,7 +68,7 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVe } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector \param cVector The complex vector where the results will be stored diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h index 6ec923a4f..ccf5eaa9d 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h @@ -5,7 +5,7 @@ #include #include -#if LV_HAVE_SSE4_1 +#ifdef LV_HAVE_SSE4_1 #include /*! \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector @@ -87,7 +87,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t } #endif /* LV_HAVE_SSE4_1 */ -#if LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC /*! 
\brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector \param cVector The complex vector where the results will be stored diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index 6f444ad89..0218e668c 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -2,7 +2,7 @@ #define INCLUDED_LIBVECTOR_COMMON_H #include -#if LV_HAVE_MMX +#ifdef LV_HAVE_MMX #include union bit128{ uint16_t i16[8]; diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index b719042df..20b77da6c 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -21,6 +21,7 @@ from make_typedefs import make_typedefs from make_environment_init_c import make_environment_init_c from make_environment_init_h import make_environment_init_h from make_mktables import make_mktables +from make_makefile_am import make_makefile_am outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); outfile_reg = open("volk_registry.h", "w"); @@ -33,11 +34,12 @@ outfile_init_h = open("../../lib/volk_init.h", "w"); outfile_init_c = open("../../lib/volk_init.c", "w"); outfile_cpu_h = open("volk_cpu.h", "w"); outfile_cpu_c = open("../../lib/volk_cpu.c", "w"); -outfile_config_in = open("../../volk_config.h.in", "w"); +#outfile_config_in = open("../../volk_config.h.in", "w"); outfile_config_fixed = open("volk_config_fixed.h", "w"); outfile_mktables = open("../../lib/volk_mktables.c", "w"); outfile_environment_c = open("../../lib/volk_environment_init.c", "w"); outfile_environment_h = open("volk_environment_init.h", "w"); +outfile_makefile_am = open("../../lib/Makefile.am", "w"); infile = open("Makefile.am", "r"); @@ -223,8 +225,8 @@ outfile_cpu_c.close(); outfile_set_simd.write(make_set_simd(filearchs)); outfile_set_simd.close(); -outfile_config_in.write(make_config_in(filearchs)); -outfile_config_in.close(); 
+#outfile_config_in.write(make_config_in(filearchs)); +#outfile_config_in.close(); outfile_reg.write(make_registry(filearchs, functions, fcountlist, taglist)); outfile_reg.close(); @@ -261,3 +263,6 @@ outfile_environment_h.close(); outfile_mktables.write(make_mktables(functions)); outfile_mktables.close(); + +outfile_makefile_am.write(make_makefile_am(filearchs)) +outfile_makefile_am.close() -- cgit From 258186d5ca2e811ced7ea637fd16e3ed3bb5573e Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 22 Mar 2011 16:55:07 -0700 Subject: Interim commit. --- volk/include/volk/make_makefile_am.py | 140 +++++++++++++++++++++ volk/include/volk/make_set_simd.py | 13 +- .../volk_32fc_x2_conjugate_dot_prod_32fc_a16.h | 4 +- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 4 +- volk/include/volk/volk_64u_popcnt_a16.h | 2 +- volk/include/volk/volk_register.py | 43 ++++++- 6 files changed, 195 insertions(+), 11 deletions(-) create mode 100644 volk/include/volk/make_makefile_am.py (limited to 'volk/include') diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py new file mode 100644 index 000000000..89ea1f4b6 --- /dev/null +++ b/volk/include/volk/make_makefile_am.py @@ -0,0 +1,140 @@ +# +# Copyright 2010 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# + +from xml.dom import minidom + +def make_makefile_am(dom, machines): + tempstring = r""" +# This file is automatically generated by make_makefile_am.py. +# Do not edit this file. + +include $(top_srcdir)/Makefile.common + +#FIXME: forcing the top_builddir for distcheck seems like a bit +# of a hack. Figure out the right way to do this to find built +# volk_config.h and volk_tables.h + +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ + -I$(top_builddir)/include \ + $(WITH_INCLUDES) + +lib_LTLIBRARIES = \ + libvolk.la \ + libvolk_runtime.la + +EXTRA_DIST = \ + volk_mktables.c \ + volk_rank_archs.h \ + volk_proccpu_sim.c \ + gcc_x86_cpuid.h + +# ---------------------------------------------------------------- +# The main library +# ---------------------------------------------------------------- + +libvolk_runtime_la_SOURCES = \ + $(platform_CODE) \ + volk_runtime.c \ + volk_rank_archs.c \ + volk_cpu.c + +libvolk_la_SOURCES = \ + $(platform_CODE) \ + volk.c \ + volk_environment_init.c + +volk_orc_LDFLAGS = \ + $(ORC_LDFLAGS) \ + -lorc-0.4 + +volk_orc_LIBADD = \ + ../orc/libvolk_orc.la + +""" + + #here be dragons + for machine_name in machines: + tempstring += "if LV_MACHINE_" + machine_name.swapcase() + tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = " + + + tempstring += """ +if LV_HAVE_ORC +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) +libvolk_la_LIBADD = $(volk_orc_LIBADD) +else +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_la_LIBADD = +endif + + +# ---------------------------------------------------------------- +# The QA library. 
Note libvolk.la in LIBADD +# ---------------------------------------------------------------- +#libvolk_qa_la_SOURCES = \ +# qa_utils.cc + +#libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lboost + +#libvolk_qa_la_LIBADD = \ +# libvolk.la \ +# libvolk_runtime.la + +# ---------------------------------------------------------------- +# headers that don't get installed +# ---------------------------------------------------------------- +noinst_HEADERS = \ + volk_init.h \ + qa_utils.h + +# ---------------------------------------------------------------- +# Our test program +# ---------------------------------------------------------------- +noinst_PROGRAMS = \ + testqa + +testqa_SOURCES = testqa.cc qa_utils.cc +testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN $(AM_CPPFLAGS) +testqa_LDFLAGS = $(BOOST_UNIT_TEST_FRAMEWORK_LIB) +if LV_HAVE_ORC +testqa_LDADD = \ + libvolk.la \ + libvolk_runtime.la \ + ../orc/libvolk_orc.la +else +testqa_LDADD = \ + libvolk.la +endif + +distclean-local: + rm -f volk.c + rm -f volk_cpu_generic.c + rm -f volk_cpu_powerpc.c + rm -f volk_cpu_x86.c + rm -f volk_init.c + rm -f volk_init.h + rm -f volk_mktables.c + rm -f volk_proccpu_sim.c + rm -f volk_runtime.c + rm -f volk_tables.h + rm -f volk_environment_init.c +""" + + + return tempstring diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index da631d217..5a848e59e 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -17,7 +17,7 @@ from xml.dom import minidom -def make_set_simd(dom) : +def make_set_simd(dom, machines) : tempstring = ""; tempstring = tempstring +'dnl this file is auto generated by volk_register.py\n\n'; @@ -148,7 +148,16 @@ def make_set_simd(dom) : for domarch in dom: arch = str(domarch.attributes["name"].value); tempstring = tempstring + " AM_CONDITIONAL(LV_MAKE_" + arch.swapcase() + ", test \"$LV_MAKE_" + arch.swapcase() + "\" == \"yes\")\n"; - + + tempstring += "\n" + #now we can 
define the machines we're compiling + for machine_name in machines: + tempstring += " AM_CONDITIONAL(LV_MACHINE_" + machine_name.swapcase() + ", " + marchlist = machines[machine_name] + for march in marchlist: + tempstring += "test \"$LV_MAKE_" + march.swapcase() + "\" == \"yes\" && " + + tempstring += "test true)\n" #just so we don't have to detect the last one in the group, i know tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" tempstring = tempstring + "])\n" diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h index 2e5036f69..d78faf5b5 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h @@ -59,7 +59,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* r #endif /*LV_HAVE_GENERIC*/ -#if defined(LV_HAVE_SSE) && defined(LV_HAVE_64) +#if LV_HAVE_SSE && LV_HAVE_64 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -202,7 +202,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* resul } #endif -#if defined(LV_HAVE_SSE) && defined(LV_HAVE_32) +#if LV_HAVE_SSE && LV_HAVE_32 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h index 9bfb88934..b7b9768ab 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h @@ -56,7 +56,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, con #endif 
/*LV_HAVE_GENERIC*/ -#if defined(LV_HAVE_SSE) && defined(LV_HAVE_64) +#if LV_HAVE_SSE && LV_HAVE_64 static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { @@ -192,7 +192,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, cons #endif -#if defined(LV_HAVE_SSE) && defined(LV_HAVE_32) +#if LV_HAVE_SSE && LV_HAVE_32 static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { diff --git a/volk/include/volk/volk_64u_popcnt_a16.h b/volk/include/volk/volk_64u_popcnt_a16.h index b416d052e..8b92e91a1 100644 --- a/volk/include/volk/volk_64u_popcnt_a16.h +++ b/volk/include/volk/volk_64u_popcnt_a16.h @@ -36,7 +36,7 @@ static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t val #endif /*LV_HAVE_GENERIC*/ -#if defined(LV_HAVE_SSE4_2) && defined(LV_HAVE_64) +#if LV_HAVE_SSE4_2 && LV_HAVE_64 #include diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 20b77da6c..d5a4a968f 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -22,6 +22,7 @@ from make_environment_init_c import make_environment_init_c from make_environment_init_h import make_environment_init_h from make_mktables import make_mktables from make_makefile_am import make_makefile_am +import copy outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); outfile_reg = open("volk_registry.h", "w"); @@ -91,7 +92,41 @@ for arch in archs: archs_or = archs_or[0:len(archs_or)-1]; archs_or = archs_or + ")"; - +#get machine list and parse to a list of machines, each with a list of archs (none of this DOM crap) +machine_str_dict = {} +mfile = minidom.parse("machines.xml"); +filemachines = mfile.getElementsByTagName("machine") + +for filemachine in filemachines: + 
machine_str_dict[str(filemachine.attributes["name"].value)] = str(filemachine.getElementsByTagName("archs")[0].firstChild.data).split() + +#all right now you have a dict of arch lists +#next we expand it +#this is an expanded list accounting for the OR syntax +#TODO: make this work for multiple "|" machines +machines = {} +already_done = False +for machine_name in machine_str_dict: + already_done = False + marchlist = machine_str_dict[machine_name] + for march in marchlist: + or_marchs = march.split("|") + if len(or_marchs) > 1: + marchlist.remove(march) + for or_march in or_marchs: + tempmarchlist = copy.deepcopy(marchlist) + tempmarchlist.append(or_march) + machines[machine_name + "_" + or_march] = tempmarchlist + already_done = True + + if not already_done: + machines[machine_name] = marchlist + +#for machine_name in machines: +# print machine_name + ": " + str(machines[machine_name]) + +#ok, now we have all the machines we're going to build. next step is to generate a Makefile.am where they're all laid out and compiled + taglist = []; fcountlist = []; arched_arglist = []; @@ -107,7 +142,7 @@ for func in functions: sourcefile = infile_source.readlines(); infile_source.close(); for line in sourcefile: - +#FIXME: make it work for multiple #if define()s archline = re.search("^\#if.*?LV_HAVE_" + archs_or + ".*", line); if archline: arch = archline.group(0); @@ -222,7 +257,7 @@ outfile_cpu_h.close(); outfile_cpu_c.write(make_cpuid_c(filearchs)); outfile_cpu_c.close(); -outfile_set_simd.write(make_set_simd(filearchs)); +outfile_set_simd.write(make_set_simd(filearchs, machines)); outfile_set_simd.close(); #outfile_config_in.write(make_config_in(filearchs)); @@ -264,5 +299,5 @@ outfile_environment_h.close(); outfile_mktables.write(make_mktables(functions)); outfile_mktables.close(); -outfile_makefile_am.write(make_makefile_am(filearchs)) +outfile_makefile_am.write(make_makefile_am(filearchs, machines)) outfile_makefile_am.close() -- cgit From 
5fffe801f95f2ef8bddf51aea8ed260eae0bf7b8 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 13 Apr 2011 18:32:28 -0700 Subject: Volk: make_makefile_am.py changes to generate cflags, ldflags. no conditional linking yet. --- volk/include/volk/make_makefile_am.py | 19 +++++++++++++++---- volk/include/volk/volk_register.py | 8 ++++++-- 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index 89ea1f4b6..c44fe11af 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -17,7 +17,7 @@ from xml.dom import minidom -def make_makefile_am(dom, machines): +def make_makefile_am(dom, machines, archflags_dict): tempstring = r""" # This file is automatically generated by make_makefile_am.py. # Do not edit this file. @@ -67,9 +67,20 @@ volk_orc_LIBADD = \ """ #here be dragons - for machine_name in machines: - tempstring += "if LV_MACHINE_" + machine_name.swapcase() - tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = " + for machine_name in machines: + tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" + tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = " + for arch in machines[machine_name]: + if archflags_dict[arch] != "none": + tempstring += "-" + archflags_dict[arch] + " " + + tempstring += "\nlibvolk_" + machine_name + "_la_CFLAGS = " + for arch in machines[machine_name]: + if archflags_dict[arch] != "none": + tempstring += "-DLV_HAVE_" + arch.swapcase() + " " + tempstring += "\nlibvolk_" + machine_name + "_la_SOURCES = $(libvolk_la_SOURCES)" + tempstring += "\nlibvolk_la_LIBADD = libvolk_" + machine_name + ".la" + tempstring += "\nendif\n" tempstring += """ diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index d5a4a968f..10610dcfe 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -83,8 +83,12 @@ for arch in archs: 
a_var = re.search("^\$", arch); if a_var: archs.remove(arch); + + - +archflags_dict = {} +for filearch in filearchs: + archflags_dict[str(filearch.attributes["name"].value)] = str(filearch.getElementsByTagName("flag")[0].firstChild.data) archs_or = "(" for arch in archs: @@ -299,5 +303,5 @@ outfile_environment_h.close(); outfile_mktables.write(make_mktables(functions)); outfile_mktables.close(); -outfile_makefile_am.write(make_makefile_am(filearchs, machines)) +outfile_makefile_am.write(make_makefile_am(filearchs, machines, archflags_dict)) outfile_makefile_am.close() -- cgit From 1877a842c81a78edd14d725b20a05b29b84c7b52 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 15 Apr 2011 12:11:08 -0700 Subject: Volk: volk_machine structures generated, volk_machines is instantiated. Testing manually works OK. All that's left is top-level volk.c and Automake wrangling. --- volk/include/volk/machines.xml | 47 ++++++++++++ volk/include/volk/make_each_machine_c.py | 83 ++++++++++++++++++++++ volk/include/volk/make_machines_c.py | 57 +++++++++++++++ volk/include/volk/make_machines_h.py | 50 +++++++++++++ volk/include/volk/make_makefile_am.py | 13 ++-- volk/include/volk/make_typedefs.py | 2 +- .../volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h | 3 +- .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h | 4 +- volk/include/volk/volk_common.h | 11 ++- volk/include/volk/volk_regexp.py | 6 ++ volk/include/volk/volk_register.py | 17 +++++ 11 files changed, 278 insertions(+), 15 deletions(-) create mode 100644 volk/include/volk/machines.xml create mode 100644 volk/include/volk/make_each_machine_c.py create mode 100644 volk/include/volk/make_machines_c.py create mode 100644 volk/include/volk/make_machines_h.py (limited to 'volk/include') diff --git a/volk/include/volk/machines.xml b/volk/include/volk/machines.xml new file mode 100644 index 000000000..8eed9e8d7 --- /dev/null +++ b/volk/include/volk/machines.xml @@ -0,0 +1,47 @@ + + + +generic + + + +generic 32|64 mmx + + + +generic 
32|64 mmx sse + + + +generic 32|64 mmx sse sse2 + + + +generic 32|64 mmx sse sse2 sse3 + + + +generic 32|64 mmx sse sse2 sse3 ssse3 + + + +generic 32|64 mmx sse sse2 sse3 sse4_a popcount + + + +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 + + + +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount + + + +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount avx + + + +generic altivec + + + diff --git a/volk/include/volk/make_each_machine_c.py b/volk/include/volk/make_each_machine_c.py new file mode 100644 index 000000000..abf4bb2d5 --- /dev/null +++ b/volk/include/volk/make_each_machine_c.py @@ -0,0 +1,83 @@ +# +# Copyright 2010 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +from volk_regexp import * +import string +from emit_omnilog import * + +#ok todo list: +#put n_archs into the info struct so it doesn't have to be arch_defs[0]. + +def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist): + tempstring = r""" +// This file is automatically generated by make_each_machine_c.py. +// Do not edit this file. 
+""" + for arch in archs: + tempstring += "#define LV_HAVE_" + arch.swapcase() + " 1\n" + + tempstring += """ +#include +#include +#include + +""" + for func in functions: + tempstring += "#include \n" + tempstring += "\n\n" + + tempstring += emit_prolog(); + + for i in range(len(functions)): + tempstring += "static const " + replace_volk.sub("p", functions[i]) + " " + functions[i] + "_archs[] = {\n" + + tags_counter = 0 + for arch_list in fcountlist[i]: + ok = True + for arch in arch_list: + if arch.swapcase() not in archs: + ok = False + if ok: + tempstring += " " + functions[i] + "_" + str(taglist[i][tags_counter]) + ",\n" + tags_counter += 1 + + tempstring = strip_trailing(tempstring, ",") + tempstring += "};\n\n" + + + tempstring += "static unsigned int caps = " + for arch in archs: + tempstring += "(1 << LV_" + arch.swapcase() + ") + " + tempstring = strip_trailing(tempstring, " + ") + tempstring += ";\n" + + tempstring += "static const char* name = \"" + machine_name + "\";\n" + tempstring += "struct volk_machine volk_machine_" + machine_name + " = {\n" + tempstring += " caps,\n" + tempstring += " name,\n" + + for i in range(len(functions)): + tempstring += " { " + functions[i] + "_indices, " + functions[i] + "_arch_defs },\n" + tempstring += " " + functions[i] + "_archs,\n" + + tempstring = strip_trailing(tempstring, ",") + tempstring += "};\n" + tempstring += emit_epilog(); + + return tempstring + + diff --git a/volk/include/volk/make_machines_c.py b/volk/include/volk/make_machines_c.py new file mode 100644 index 000000000..55c0f1c06 --- /dev/null +++ b/volk/include/volk/make_machines_c.py @@ -0,0 +1,57 @@ +# +# Copyright 2010 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +from volk_regexp import * + +def make_machines_c(machines): + tempstring = r""" +// This file is automatically generated by make_machines_c.py. +// Do not edit this file. + +#include +#include +#include + +volk_machine volk_machines[] = { +""" + for machine in machines: + tempstring += """#if LV_MACHINE_""" + machine.swapcase() + "\n" + tempstring += "volk_machine_" + machine + tempstring += "," + tempstring += "\n#endif\n" + + tempstring += r""" +}; + +""" + + for machine in machines: + tempstring += "#if LV_MACHINE_" + machine.swapcase() + "\n" + tempstring += "#define LV_MACHINE_" + machine.swapcase() + "_CNT 1\n" + tempstring += "#else\n" + tempstring += "#define LV_MACHINE_" + machine.swapcase() + "_CNT 0\n" + tempstring += "#endif\n" + + tempstring += """unsigned int n_volk_machines = +""" + for machine in machines: + tempstring += "(LV_MACHINE_" + machine.swapcase() + "_CNT) " + tempstring += "+ " + tempstring = tempstring[:-2] + tempstring += ";\n" + + return tempstring diff --git a/volk/include/volk/make_machines_h.py b/volk/include/volk/make_machines_h.py new file mode 100644 index 000000000..68cdf3363 --- /dev/null +++ b/volk/include/volk/make_machines_h.py @@ -0,0 +1,50 @@ +# +# Copyright 2010 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +from volk_regexp import * + +def make_machines_h(functions, machines): + tempstring = r""" +// This file is automatically generated by make_machines_h.py. +// Do not edit this file. + +#ifndef INCLUDED_LIBVOLK_MACHINES_H +#define INCLUDED_LIBVOLK_MACHINES_H + +#include +#include + +struct volk_machine { + const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format) + const char *name; +""" + for function in functions: + tempstring += "\n const struct volk_func_desc " + function + "_desc;\n" + tempstring += " const " + replace_volk.sub("p", function) + " *" + function + "_archs;\n" + + tempstring += r"""}; + +""" + for machine in machines: + tempstring += """#if LV_MACHINE_""" + machine.swapcase() + "\n" + tempstring += "extern const struct volk_machine volk_machine_" + machine + ";\n" + tempstring += """#endif\n""" + + tempstring += r""" +#endif //INCLUDED_LIBVOLK_MACHINES_H""" + + return tempstring diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index c44fe11af..1c4c8fe7d 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -69,17 +69,16 @@ volk_orc_LIBADD = \ #here be dragons for machine_name in machines: tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" - tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = " + tempstring += "libvolk_" + machine_name + "_ar_LDFLAGS = " for arch in machines[machine_name]: if archflags_dict[arch] != "none": tempstring += "-" + archflags_dict[arch] + " " - tempstring += "\nlibvolk_" + machine_name + "_la_CFLAGS = " - 
for arch in machines[machine_name]: - if archflags_dict[arch] != "none": - tempstring += "-DLV_HAVE_" + arch.swapcase() + " " - tempstring += "\nlibvolk_" + machine_name + "_la_SOURCES = $(libvolk_la_SOURCES)" - tempstring += "\nlibvolk_la_LIBADD = libvolk_" + machine_name + ".la" +# tempstring += "\nlibvolk_" + machine_name + "_ar_CFLAGS = " +# for arch in machines[machine_name]: +# tempstring += "-DLV_HAVE_" + arch.swapcase() + " " + tempstring += "\nlibvolk_" + machine_name + "_ar_SOURCES = libvolk_machine_" + machine_name + ".cc" + tempstring += "\nlibvolk_la_LIBADD = libvolk_" + machine_name + ".ar" tempstring += "\nendif\n" diff --git a/volk/include/volk/make_typedefs.py b/volk/include/volk/make_typedefs.py index fe81cb2b0..8f9f2b55e 100644 --- a/volk/include/volk/make_typedefs.py +++ b/volk/include/volk/make_typedefs.py @@ -16,7 +16,7 @@ def make_typedefs(funclist, retlist, my_argtypelist) : tempstring = tempstring + '\n'; for i in range(len(funclist)): - tempstring = tempstring + "typedef " + retlist[i] +" (*" + replace_volk.sub("p", funclist[i]) + ")(" + my_argtypelist[i] + ");\n\n"; + tempstring = tempstring + "typedef " + retlist[i] +" (*" + replace_volk.sub("p", funclist[i]) + ")(" + my_argtypelist[i] + ");\n"; tempstring = tempstring + "#endif /*INCLUDED_VOLK_TYPEDEFS*/\n"; diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h index 014f662a3..9e8982e9b 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h @@ -23,7 +23,6 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVe const lv_8sc_t* a = aVector; const lv_8sc_t* b = bVector; __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - const int shuffleMask = _MM_SHUFFLE(2,3,0,1); for(;number < quarterPoints; number++){ // Convert into 8 bit values into 16 bit values @@ -37,7 +36,7 @@ static 
inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVe y = _mm_sign_epi16(y, conjugateSign); // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); // Calculate the ar*(-ci) + cr*(ai) imagz = _mm_madd_epi16(x,y); diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h index ccf5eaa9d..fa58ff058 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h @@ -24,7 +24,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t const lv_8sc_t* a = aVector; const lv_8sc_t* b = bVector; __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + __m128 invScalar = _mm_set_ps1(1.0/scalar); for(;number < quarterPoints; number++){ @@ -39,7 +39,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t y = _mm_sign_epi16(y, conjugateSign); // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); // Calculate the ar*(-ci) + cr*(ai) imagz = _mm_madd_epi16(x,y); diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index 0218e668c..e050600f0 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_LIBVECTOR_COMMON_H -#define INCLUDED_LIBVECTOR_COMMON_H +#ifndef INCLUDED_LIBVOLK_COMMON_H +#define INCLUDED_LIBVOLK_COMMON_H #include #ifdef LV_HAVE_MMX @@ -15,4 +15,9 @@ union bit128{ }; #endif /*LV_HAVE_MMX*/ -#endif /*INCLUDED_LIBVECTOR_COMMON_H*/ +struct volk_func_desc { + const 
char **indices; + const int *arch_defs; +}; + +#endif /*INCLUDED_LIBVOLK_COMMON_H*/ diff --git a/volk/include/volk/volk_regexp.py b/volk/include/volk/volk_regexp.py index 7b695cb3b..b83ce5206 100644 --- a/volk/include/volk/volk_regexp.py +++ b/volk/include/volk/volk_regexp.py @@ -1,4 +1,5 @@ import re +import string remove_after_underscore = re.compile("_.*"); space_remove = re.compile(" "); @@ -6,3 +7,8 @@ leading_space_remove = re.compile("^ *"); replace_arch = re.compile(", const char\* arch"); replace_bracket = re.compile(" {"); replace_volk = re.compile("volk"); + +def strip_trailing(tostrip, stripstr): + lindex = tostrip.rfind(stripstr) + tostrip = tostrip[0:lindex] + string.replace(tostrip[lindex:len(tostrip)], stripstr, ""); + return tostrip diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 10610dcfe..9d33abe89 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -22,6 +22,9 @@ from make_environment_init_c import make_environment_init_c from make_environment_init_h import make_environment_init_h from make_mktables import make_mktables from make_makefile_am import make_makefile_am +from make_machines_h import make_machines_h +from make_machines_c import make_machines_c +from make_each_machine_c import make_each_machine_c import copy outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); @@ -41,6 +44,8 @@ outfile_mktables = open("../../lib/volk_mktables.c", "w"); outfile_environment_c = open("../../lib/volk_environment_init.c", "w"); outfile_environment_h = open("volk_environment_init.h", "w"); outfile_makefile_am = open("../../lib/Makefile.am", "w"); +outfile_machines_h = open("volk_machines.h", "w"); +outfile_machines_c = open("../../lib/volk_machines.c", "w"); infile = open("Makefile.am", "r"); @@ -305,3 +310,15 @@ outfile_mktables.close(); outfile_makefile_am.write(make_makefile_am(filearchs, machines, archflags_dict)) outfile_makefile_am.close() + 
+outfile_machines_h.write(make_machines_h(functions, machines)) +outfile_machines_h.close() + +outfile_machines_c.write(make_machines_c(machines)) +outfile_machines_c.close() + +for machine in machines: + machine_c_filename = "../../lib/volk_machine_" + machine + ".c" + outfile_machine_c = open(machine_c_filename, "w") + outfile_machine_c.write(make_each_machine_c(machine, machines[machine], functions, fcountlist, taglist)) + outfile_machine_c.close() -- cgit From 8608fc3ae5f891eb5c11345a9b6102fdc8ae2393 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 15 Apr 2011 17:51:28 -0700 Subject: Volk: volk.c is now generated. manual build works OK. --- volk/include/volk/make_c.py | 127 +++++++++++++++++++++-------------- volk/include/volk/make_h.py | 28 ++++---- volk/include/volk/make_init_c.py | 42 ------------ volk/include/volk/make_init_h.py | 26 ------- volk/include/volk/make_machines_c.py | 2 +- volk/include/volk/make_mktables.py | 33 --------- volk/include/volk/make_runtime.py | 27 -------- volk/include/volk/make_runtime_c.py | 47 ------------- volk/include/volk/volk_register.py | 51 +++----------- 9 files changed, 102 insertions(+), 281 deletions(-) delete mode 100644 volk/include/volk/make_init_c.py delete mode 100644 volk/include/volk/make_init_h.py delete mode 100644 volk/include/volk/make_mktables.py delete mode 100644 volk/include/volk/make_runtime.py delete mode 100644 volk/include/volk/make_runtime_c.py (limited to 'volk/include') diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index 89bf9ea1a..5f6669e40 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -1,55 +1,84 @@ -from xml.dom import minidom -import string +# +# Copyright 2010 Free Software Foundation, Inc. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + from volk_regexp import * +import string +from emit_omnilog import * +#ok todo list: +#put n_archs into the info struct so it doesn't have to be arch_defs[0]. -def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - for func in funclist: - tempstring = tempstring + "#include\n" ; - tempstring = tempstring + '\n'; - - tempstring = tempstring + "static inline unsigned int volk_get_index(const char** indices, const char* arch, const int* arch_defs) {\n"; - tempstring = tempstring + " int i = 1;\n" - tempstring = tempstring + " for(;i +#include +#include +#include +#include +#include +#include +#include + +""" + tempstring += emit_prolog(); - tempstring = tempstring + retlist[i] + "inline " + funclist[i] + replace_arch.sub("", arched_arglist[i]) + '\n'; +#OK here's the deal. the .h prototypes the functions. the .c impls them as fptrs, can use p_whatever. 
+#also .c impls the get_machine call +#also .c impls the default call for each fn + +#here do static fn get arch + tempstring += r""" +struct volk_machine *get_machine(void) { + extern struct volk_machine volk_machines[]; + extern unsigned int n_volk_machines; + static struct volk_machine *machine = NULL; - tempstring = tempstring + funclist[i] + "_archs[" + funclist[i] + "_func_table](" + my_arglist[i] + ");" + '\n'; - tempstring = tempstring + "}\n\n"; + if(machine != NULL) return machine; + else { + unsigned int max_score = 0; + int i; + for(i=0; i max_score) { + max_score = volk_machines[i].caps; + machine = &(volk_machines[i]); + } + } + } + printf("Using Volk machine: %s\n", machine->name); + return machine; + } +} + +""" - return tempstring; + for i in range(len(functions)): + tempstring += "void get_" + functions[i] + replace_arch.sub("", arched_arglist[i]) + "\n" + tempstring += " %s = get_machine()->%s_archs[volk_rank_archs(get_machine()->%s_desc.arch_defs, volk_get_lvarch())];\n" % (functions[i], functions[i], functions[i]) + tempstring += " %s(%s);\n}\n\n" % (functions[i], my_arglist[i]) + tempstring += replace_volk.sub("p", functions[i]) + " " + functions[i] + " = &get_" + functions[i] + ";\n\n" + + + tempstring += emit_epilog(); + + return tempstring + + diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py index 6aea441b7..b235cd657 100644 --- a/volk/include/volk/make_h.py +++ b/volk/include/volk/make_h.py @@ -4,24 +4,24 @@ from volk_regexp import * -def make_h(funclist, arched_arglist, retlist) : +def make_h(funclist) : tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_H'; - tempstring = tempstring + '\n#define INCLUDED_VOLK_H'; - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; + tempstring = tempstring + '/*this file is auto generated by make_h.py*/\n'; + + tempstring = tempstring + '\n#ifndef 
INCLUDED_VOLK_RUNTIME'; + tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME'; + tempstring = tempstring + '\n\n#include\n'; tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n' - tempstring = tempstring + emit_prolog() - tempstring = tempstring + '\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + emit_prolog(); - for i in range(len(retlist)): - tempstring = tempstring + retlist[i] + funclist[i] + replace_bracket.sub(";", replace_arch.sub("", arched_arglist[i])) + '\n'; - tempstring = tempstring + retlist[i] + funclist[i] + "_manual" + replace_bracket.sub(";", arched_arglist[i]) + '\n'; + tempstring = tempstring + '\n'; + + for i in range(len(funclist)): + tempstring = tempstring + "extern " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + ";\n" tempstring = tempstring + emit_epilog(); + tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; - tempstring = tempstring + "#endif /*INCLUDED_VOLK_H*/\n"; - return tempstring; + diff --git a/volk/include/volk/make_init_c.py b/volk/include/volk/make_init_c.py deleted file mode 100644 index 0a7010cd6..000000000 --- a/volk/include/volk/make_init_c.py +++ /dev/null @@ -1,42 +0,0 @@ -from xml.dom import minidom - -def make_init_c(funclist, dom) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - incs = domarch.getElementsByTagName("include"); - for inc in incs: - my_inc = str(inc.firstChild.data); - tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; - tempstring = tempstring + "#include<" + my_inc + ">\n"; - tempstring = tempstring + "#endif\n" - tempstring = tempstring + '\n\n'; - - tempstring = tempstring + "extern struct VOLK_RUNTIME volk_runtime;\n\n"; - tempstring = 
tempstring + "struct VOLK_RUNTIME* get_volk_runtime(){\n"; - tempstring = tempstring + " return &volk_runtime;\n"; - tempstring = tempstring + "}\n\n" - tempstring = tempstring + " void volk_runtime_init() {\nvolk_cpu_init();\n"; - - for func in funclist: - tempstring = tempstring + " volk_runtime." + func + " = default_acquire_" + func + ";\n"; - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - envs = domarch.getElementsByTagName("environment"); - for env in envs: - cmd = str(env.firstChild.data); - tempstring = tempstring + " if(volk_cpu.has_" + arch + "()){\n"; - tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; - tempstring = tempstring + " " + cmd + "\n"; - tempstring = tempstring + "#endif\n" - tempstring = tempstring + " }\n"; - - tempstring = tempstring + "}\n"; - - return tempstring diff --git a/volk/include/volk/make_init_h.py b/volk/include/volk/make_init_h.py deleted file mode 100644 index 6dbe1c585..000000000 --- a/volk/include/volk/make_init_h.py +++ /dev/null @@ -1,26 +0,0 @@ -from xml.dom import minidom -from emit_omnilog import * -from volk_regexp import * - - - -def make_init_h(funclist, arched_arglist, retlist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - - tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_INIT_H'; - tempstring = tempstring + '\n#define INCLUDED_VOLK_INIT_H'; - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; - - tempstring = tempstring + '\n'; - - tempstring = tempstring + emit_prolog(); - - for i in range(len(retlist)): - tempstring = tempstring + retlist[i] + " default_acquire_" + funclist[i] + replace_bracket.sub(";", replace_arch.sub("", arched_arglist[i])) + '\n'; - - tempstring= tempstring + emit_epilog(); - tempstring = tempstring + "#endif /*INCLUDED_VOLK_INIT_H*/\n"; - - return tempstring; diff --git a/volk/include/volk/make_machines_c.py b/volk/include/volk/make_machines_c.py 
index 55c0f1c06..12eabf98d 100644 --- a/volk/include/volk/make_machines_c.py +++ b/volk/include/volk/make_machines_c.py @@ -26,7 +26,7 @@ def make_machines_c(machines): #include #include -volk_machine volk_machines[] = { +struct volk_machine volk_machines[] = { """ for machine in machines: tempstring += """#if LV_MACHINE_""" + machine.swapcase() + "\n" diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py deleted file mode 100644 index 051ac268d..000000000 --- a/volk/include/volk/make_mktables.py +++ /dev/null @@ -1,33 +0,0 @@ - - -def make_mktables(funclist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; - - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstrgin = tempstring + '#include\n'; - tempstring = tempstring + "\n\n"; - - tempstring = tempstring + 'int main() {\n'; - tempstring = tempstring + ' int i = 0;\n'; - tempstring = tempstring + ' FILE* output;\n'; - tempstring = tempstring + ' output = fopen("volk_tables.h", "w");\n'; - tempstring = tempstring + ' fprintf(output, "#ifndef INCLUDED_VOLK_TABLES_H\\n");\n'; - tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n'; - - for func in funclist: - tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", volk_rank_archs(' + func + '_arch_defs, volk_get_lvarch()));\n'; - tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n'; - tempstring = tempstring + ' fclose(output);\n' - tempstring = tempstring + '}\n'; - return tempstring; - - - - - - - - diff --git a/volk/include/volk/make_runtime.py b/volk/include/volk/make_runtime.py deleted file mode 100644 index d468487d7..000000000 --- a/volk/include/volk/make_runtime.py +++ /dev/null @@ -1,27 +0,0 @@ -from xml.dom import minidom -from emit_omnilog import * -from volk_regexp import * - - - -def 
make_runtime(funclist, arglist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; - - tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_RUNTIME'; - tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME'; - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + emit_prolog(); - - tempstring = tempstring + '\n'; - - for i in range(len(funclist)): - tempstring = tempstring + "extern void (*" + funclist[i] + ")(" + arglist[i] + ");\n" - - tempstring = tempstring + emit_epilog(); - tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; - - return tempstring; - diff --git a/volk/include/volk/make_runtime_c.py b/volk/include/volk/make_runtime_c.py deleted file mode 100644 index 99cdf395f..000000000 --- a/volk/include/volk/make_runtime_c.py +++ /dev/null @@ -1,47 +0,0 @@ -from xml.dom import minidom -import string -from volk_regexp import * - - -def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist, my_argtypelist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - - - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + "#include\n"; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - - for func in funclist: - tempstring = tempstring + "#include\n" ; - tempstring = tempstring + '\n'; - - for i in range(len(funclist)): - tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n"; - - tags_counter = 0; - for arch_list in fcountlist[i]: - tempstring = tempstring + "#if defined(LV_HAVE_" - for ind in range(len(arch_list)): - - tempstring = tempstring + arch_list[ind] + ")"; - if ind < len(arch_list) - 1: - tempstring = tempstring + " && defined(LV_HAVE_"; - - tempstring 
= tempstring + "\n " + funclist[i] + "_" + str(taglist[i][tags_counter]) + ",\n#endif\n"; - tags_counter = tags_counter + 1; - - lindex = tempstring.rfind(","); - tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); - tempstring = tempstring + "};\n\n"; - - - tempstring = tempstring + retlist[i] + "default_acquire_" + funclist[i] + replace_arch.sub("", arched_arglist[i]) + '\n'; - tempstring = tempstring + " %s = %s_archs[volk_rank_archs(%s_arch_defs, volk_get_lvarch())];\n" % (funclist[i], funclist[i], funclist[i]) - tempstring = tempstring + " %s(%s);\n}\n\n" % (funclist[i], my_arglist[i]) - - tempstring = tempstring + "%s(*%s)(%s) = &default_acquire_%s;\n\n" % (retlist[i], funclist[i], my_argtypelist[i], funclist[i]) - - return tempstring; diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 9d33abe89..437fb33b1 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -9,38 +9,29 @@ from make_cpuid_c import make_cpuid_c from make_cpuid_h import make_cpuid_h from make_set_simd import make_set_simd from make_registry import make_registry -from make_h import make_h -from make_init_h import make_init_h from make_config_fixed import make_config_fixed -from make_config_in import make_config_in -from make_c import make_c -from make_runtime_c import make_runtime_c -from make_init_c import make_init_c -from make_runtime import make_runtime from make_typedefs import make_typedefs from make_environment_init_c import make_environment_init_c from make_environment_init_h import make_environment_init_h -from make_mktables import make_mktables from make_makefile_am import make_makefile_am from make_machines_h import make_machines_h from make_machines_c import make_machines_c from make_each_machine_c import make_each_machine_c +from make_c import make_c +from make_h import make_h import copy outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); 
outfile_reg = open("volk_registry.h", "w"); outfile_h = open("volk.h", "w"); outfile_c = open("../../lib/volk.c", "w"); -outfile_runtime = open("volk_runtime.h", "w"); -outfile_runtime_c = open("../../lib/volk_runtime.c", "w"); outfile_typedefs = open("volk_typedefs.h", "w"); outfile_init_h = open("../../lib/volk_init.h", "w"); -outfile_init_c = open("../../lib/volk_init.c", "w"); outfile_cpu_h = open("volk_cpu.h", "w"); outfile_cpu_c = open("../../lib/volk_cpu.c", "w"); #outfile_config_in = open("../../volk_config.h.in", "w"); outfile_config_fixed = open("volk_config_fixed.h", "w"); -outfile_mktables = open("../../lib/volk_mktables.c", "w"); +#outfile_mktables = open("../../lib/volk_mktables.c", "w"); outfile_environment_c = open("../../lib/volk_environment_init.c", "w"); outfile_environment_h = open("volk_environment_init.h", "w"); outfile_makefile_am = open("../../lib/Makefile.am", "w"); @@ -268,46 +259,16 @@ outfile_cpu_c.close(); outfile_set_simd.write(make_set_simd(filearchs, machines)); outfile_set_simd.close(); - -#outfile_config_in.write(make_config_in(filearchs)); -#outfile_config_in.close(); outfile_reg.write(make_registry(filearchs, functions, fcountlist, taglist)); outfile_reg.close(); -outfile_h.write(make_h(functions, arched_arglist, retlist)); -outfile_h.close(); - -outfile_init_h.write(make_init_h(functions, arched_arglist, retlist)); -outfile_init_h.close(); - outfile_config_fixed.write(make_config_fixed(filearchs)); outfile_config_fixed.close(); -outfile_c.write( make_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist)); -outfile_c.close(); - -outfile_runtime_c.write(make_runtime_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist, my_argtypelist)); -outfile_runtime_c.close(); - -outfile_init_c.write(make_init_c(functions, filearchs)); -outfile_init_c.close(); - -outfile_runtime.write(make_runtime(functions, my_argtypelist)); -outfile_runtime.close(); - outfile_typedefs.write(make_typedefs(functions, retlist, 
my_argtypelist)); outfile_typedefs.close(); -outfile_environment_c.write(make_environment_init_c(filearchs)); -outfile_environment_c.close(); - -outfile_environment_h.write(make_environment_init_h()); -outfile_environment_h.close(); - -outfile_mktables.write(make_mktables(functions)); -outfile_mktables.close(); - outfile_makefile_am.write(make_makefile_am(filearchs, machines, archflags_dict)) outfile_makefile_am.close() @@ -317,6 +278,12 @@ outfile_machines_h.close() outfile_machines_c.write(make_machines_c(machines)) outfile_machines_c.close() +outfile_c.write(make_c(machines, archs, functions, arched_arglist, my_arglist)) +outfile_c.close() + +outfile_h.write(make_h(functions)) +outfile_h.close() + for machine in machines: machine_c_filename = "../../lib/volk_machine_" + machine + ".c" outfile_machine_c = open(machine_c_filename, "w") -- cgit From d59273f379d7ab997e50c1dd8b0ac3dfc7bd1e33 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 15 Apr 2011 22:48:13 -0700 Subject: Volk: lib compiles & links & works for hand test. testsuite doesn't due to lack of _manual() capability. 
--- volk/include/volk/Makefile.am | 18 ------------ volk/include/volk/machines.xml | 2 ++ volk/include/volk/make_c.py | 14 +++++----- volk/include/volk/make_each_machine_c.py | 6 ++-- volk/include/volk/make_machines_c.py | 6 ++-- volk/include/volk/make_machines_h.py | 4 +-- volk/include/volk/make_makefile_am.py | 48 +++++++++++++------------------- volk/include/volk/volk_register.py | 6 ++-- 8 files changed, 39 insertions(+), 65 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index caf707138..876bb7a60 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -25,15 +25,10 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ volkincludedir = $(prefix)/include/volk -BUILT_SOURCES: \ - volk_tables.h - volkinclude_HEADERS = \ volk_complex.h \ volk_common.h \ volk_config_fixed.h \ - volk_runtime.h \ - volk_tables.h \ volk_typedefs.h \ volk_registry.h \ volk.h \ @@ -128,19 +123,6 @@ volkinclude_HEADERS = \ volk_8i_s32f_convert_32f_a16.h \ volk_8i_s32f_convert_32f_u.h -VOLK_MKTABLES_SOURCES = \ - $(platform_CODE) \ - $(top_srcdir)/lib/volk_rank_archs.c \ - $(top_srcdir)/lib/volk_mktables.c \ - $(top_srcdir)/lib/volk_cpu.c - - -volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) - $(CC) -o $@ $^ $(AM_CPPFLAGS) -I$(top_builddir)/include - -volk_tables.h: volk_mktables$(EXEEXT) - ./volk_mktables$(EXEEXT) - distclean-local: rm -f volk_config_fixed.h rm -f volk_cpu.h diff --git a/volk/include/volk/machines.xml b/volk/include/volk/machines.xml index 8eed9e8d7..ad71da177 100644 --- a/volk/include/volk/machines.xml +++ b/volk/include/volk/machines.xml @@ -4,6 +4,7 @@ generic + generic 32|64 mmx sse sse2 diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index 5f6669e40..7cb0616c8 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -28,13 +28,13 @@ def make_c(machines, archs, functions, arched_arglist, my_arglist): // Do not edit 
this file. """ tempstring += """ -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include "volk_rank_archs.h" +#include #include """ diff --git a/volk/include/volk/make_each_machine_c.py b/volk/include/volk/make_each_machine_c.py index abf4bb2d5..866e18bc3 100644 --- a/volk/include/volk/make_each_machine_c.py +++ b/volk/include/volk/make_each_machine_c.py @@ -31,9 +31,9 @@ def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist): tempstring += "#define LV_HAVE_" + arch.swapcase() + " 1\n" tempstring += """ -#include -#include -#include +#include +#include +#include """ for func in functions: diff --git a/volk/include/volk/make_machines_c.py b/volk/include/volk/make_machines_c.py index 12eabf98d..9ad56fb62 100644 --- a/volk/include/volk/make_machines_c.py +++ b/volk/include/volk/make_machines_c.py @@ -22,9 +22,9 @@ def make_machines_c(machines): // This file is automatically generated by make_machines_c.py. // Do not edit this file. 
-#include -#include -#include +#include +#include +#include struct volk_machine volk_machines[] = { """ diff --git a/volk/include/volk/make_machines_h.py b/volk/include/volk/make_machines_h.py index 68cdf3363..674ee12cd 100644 --- a/volk/include/volk/make_machines_h.py +++ b/volk/include/volk/make_machines_h.py @@ -25,8 +25,8 @@ def make_machines_h(functions, machines): #ifndef INCLUDED_LIBVOLK_MACHINES_H #define INCLUDED_LIBVOLK_MACHINES_H -#include -#include +#include +#include struct volk_machine { const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format) diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index 1c4c8fe7d..2c9ef8fb6 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -24,20 +24,14 @@ def make_makefile_am(dom, machines, archflags_dict): include $(top_srcdir)/Makefile.common -#FIXME: forcing the top_builddir for distcheck seems like a bit -# of a hack. 
Figure out the right way to do this to find built -# volk_config.h and volk_tables.h - AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ -I$(top_builddir)/include \ $(WITH_INCLUDES) lib_LTLIBRARIES = \ - libvolk.la \ - libvolk_runtime.la + libvolk.la EXTRA_DIST = \ - volk_mktables.c \ volk_rank_archs.h \ volk_proccpu_sim.c \ gcc_x86_cpuid.h @@ -46,16 +40,14 @@ EXTRA_DIST = \ # The main library # ---------------------------------------------------------------- -libvolk_runtime_la_SOURCES = \ +libvolk_la_SOURCES = \ $(platform_CODE) \ - volk_runtime.c \ + volk.cc \ + volk_cpu.c \ volk_rank_archs.c \ - volk_cpu.c + volk_machines.cc -libvolk_la_SOURCES = \ - $(platform_CODE) \ - volk.c \ - volk_environment_init.c +libvolk_la_LDFLAGS = volk_orc_LDFLAGS = \ $(ORC_LDFLAGS) \ @@ -66,31 +58,31 @@ volk_orc_LIBADD = \ """ - #here be dragons + #here be dragons + tempstring += "libvolk_la_LIBADD = \n" + tempstring += "libvolk_la_CPPFLAGS = $(AM_CPPFLAGS)\n" + tempstring += "noinst_LTLIBRARIES = \n" for machine_name in machines: tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" - tempstring += "libvolk_" + machine_name + "_ar_LDFLAGS = " + tempstring += "libvolk_" + machine_name + "_la_SOURCES = volk_machine_" + machine_name + ".cc\n" + tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include " for arch in machines[machine_name]: if archflags_dict[arch] != "none": tempstring += "-" + archflags_dict[arch] + " " -# tempstring += "\nlibvolk_" + machine_name + "_ar_CFLAGS = " -# for arch in machines[machine_name]: -# tempstring += "-DLV_HAVE_" + arch.swapcase() + " " - tempstring += "\nlibvolk_" + machine_name + "_ar_SOURCES = libvolk_machine_" + machine_name + ".cc" - tempstring += "\nlibvolk_la_LIBADD = libvolk_" + machine_name + ".ar" + tempstring += "\nnoinst_LTLIBRARIES += libvolk_" + machine_name + ".la " + tempstring += "\nlibvolk_la_LIBADD += libvolk_" + machine_name + ".la\n" + tempstring += "libvolk_la_CPPFLAGS += -DLV_MACHINE_" + 
machine_name.swapcase() + " " tempstring += "\nendif\n" tempstring += """ if LV_HAVE_ORC -libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) -libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) -libvolk_la_LIBADD = $(volk_orc_LIBADD) +libvolk_la_LDFLAGS += $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) +libvolk_la_LIBADD += $(volk_orc_LIBADD) else -libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -libvolk_la_LIBADD = +libvolk_la_LDFLAGS += $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_la_LIBADD += endif @@ -125,7 +117,6 @@ testqa_LDFLAGS = $(BOOST_UNIT_TEST_FRAMEWORK_LIB) if LV_HAVE_ORC testqa_LDADD = \ libvolk.la \ - libvolk_runtime.la \ ../orc/libvolk_orc.la else testqa_LDADD = \ @@ -141,7 +132,6 @@ distclean-local: rm -f volk_init.h rm -f volk_mktables.c rm -f volk_proccpu_sim.c - rm -f volk_runtime.c rm -f volk_tables.h rm -f volk_environment_init.c """ diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 437fb33b1..a319e9dad 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -24,7 +24,7 @@ import copy outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); outfile_reg = open("volk_registry.h", "w"); outfile_h = open("volk.h", "w"); -outfile_c = open("../../lib/volk.c", "w"); +outfile_c = open("../../lib/volk.cc", "w"); outfile_typedefs = open("volk_typedefs.h", "w"); outfile_init_h = open("../../lib/volk_init.h", "w"); outfile_cpu_h = open("volk_cpu.h", "w"); @@ -36,7 +36,7 @@ outfile_environment_c = open("../../lib/volk_environment_init.c", "w"); outfile_environment_h = open("volk_environment_init.h", "w"); outfile_makefile_am = open("../../lib/Makefile.am", "w"); outfile_machines_h = open("volk_machines.h", "w"); -outfile_machines_c = open("../../lib/volk_machines.c", "w"); +outfile_machines_c = open("../../lib/volk_machines.cc", 
"w"); infile = open("Makefile.am", "r"); @@ -285,7 +285,7 @@ outfile_h.write(make_h(functions)) outfile_h.close() for machine in machines: - machine_c_filename = "../../lib/volk_machine_" + machine + ".c" + machine_c_filename = "../../lib/volk_machine_" + machine + ".cc" outfile_machine_c = open(machine_c_filename, "w") outfile_machine_c.write(make_each_machine_c(machine, machines[machine], functions, fcountlist, taglist)) outfile_machine_c.close() -- cgit From 7c48568abc3f0aa918e199f68acca5740250ee5d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 15 Apr 2011 23:04:29 -0700 Subject: Volk: hack Orc back in --- volk/include/volk/make_makefile_am.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index 2c9ef8fb6..bd6283b58 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -65,7 +65,7 @@ volk_orc_LIBADD = \ for machine_name in machines: tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" tempstring += "libvolk_" + machine_name + "_la_SOURCES = volk_machine_" + machine_name + ".cc\n" - tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include " + tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include $(volk_orc_CFLAGS) " for arch in machines[machine_name]: if archflags_dict[arch] != "none": tempstring += "-" + archflags_dict[arch] + " " @@ -80,9 +80,11 @@ volk_orc_LIBADD = \ if LV_HAVE_ORC libvolk_la_LDFLAGS += $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) libvolk_la_LIBADD += $(volk_orc_LIBADD) +volk_orc_CFLAGS = -DLV_HAVE_ORC else libvolk_la_LDFLAGS += $(NO_UNDEFINED) -version-info 0:0:0 libvolk_la_LIBADD += +volk_orc_CFLAGS = endif -- cgit From 8b04d29221719239b52a300a338513f05746ed7f Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Sat, 16 Apr 2011 10:05:43 -0700 Subject: Volk: split n_archs out of arch_defs[0], 
began to add _manual support --- volk/include/volk/make_c.py | 9 ++++++++- volk/include/volk/make_each_machine_c.py | 2 +- volk/include/volk/make_registry.py | 33 +++++++++++++++++--------------- volk/include/volk/volk_common.h | 1 + 4 files changed, 28 insertions(+), 17 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index 7cb0616c8..1129285bb 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -66,13 +66,20 @@ struct volk_machine *get_machine(void) { printf("Using Volk machine: %s\n", machine->name); return machine; } +} + +static unsigned int get_index(const char **indices, char *arch_name) { + + + + } """ for i in range(len(functions)): tempstring += "void get_" + functions[i] + replace_arch.sub("", arched_arglist[i]) + "\n" - tempstring += " %s = get_machine()->%s_archs[volk_rank_archs(get_machine()->%s_desc.arch_defs, volk_get_lvarch())];\n" % (functions[i], functions[i], functions[i]) + tempstring += " %s = get_machine()->%s_archs[volk_rank_archs(get_machine()->%s_desc.arch_defs, get_machine()->%s_desc.n_archs, volk_get_lvarch())];\n" % (functions[i], functions[i], functions[i], functions[i]) tempstring += " %s(%s);\n}\n\n" % (functions[i], my_arglist[i]) tempstring += replace_volk.sub("p", functions[i]) + " " + functions[i] + " = &get_" + functions[i] + ";\n\n" diff --git a/volk/include/volk/make_each_machine_c.py b/volk/include/volk/make_each_machine_c.py index 866e18bc3..11f669566 100644 --- a/volk/include/volk/make_each_machine_c.py +++ b/volk/include/volk/make_each_machine_c.py @@ -71,7 +71,7 @@ def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist): tempstring += " name,\n" for i in range(len(functions)): - tempstring += " { " + functions[i] + "_indices, " + functions[i] + "_arch_defs },\n" + tempstring += " { " + functions[i] + "_indices, " + functions[i] + "_arch_defs, " + functions[i] + "_n_archs },\n" tempstring += " " + functions[i] + 
"_archs,\n" tempstring = strip_trailing(tempstring, ",") diff --git a/volk/include/volk/make_registry.py b/volk/include/volk/make_registry.py index 7fbe9a8b0..de1f46aa6 100644 --- a/volk/include/volk/make_registry.py +++ b/volk/include/volk/make_registry.py @@ -1,5 +1,6 @@ from xml.dom import minidom from emit_omnilog import * +from volk_regexp import * import string def make_registry(dom, funclist, fcountlist, taglist) : @@ -39,24 +40,13 @@ def make_registry(dom, funclist, fcountlist, taglist) : tempstring = tempstring + "\n \"" + str(taglist[i][tags_counter]) + "\",\n#endif\n"; tags_counter = tags_counter + 1; - lindex = tempstring.rfind(","); - tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); + tempstring = strip_trailing(tempstring, ",") tempstring = tempstring + "};\n\n"; for fcount in fcountlist: tempstring = tempstring + "static const int " + funclist[counter] + "_arch_defs[] = {\n"; - counter = counter + 1; - for arch_list in fcount: - tempstring = tempstring + " (LV_" - for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind] + "_CNT"; - if ind < len(arch_list) - 1: - tempstring = tempstring + " * LV_"; - tempstring = tempstring + ") + "; - lindex = tempstring.rfind(" + "); - tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], " + ", ""); - tempstring = tempstring + ",\n" + counter += 1; for arch_list in fcount: tempstring = tempstring + "#if defined(LV_HAVE_" for ind in range(len(arch_list)): @@ -70,9 +60,22 @@ def make_registry(dom, funclist, fcountlist, taglist) : if ind < len(arch_list) - 1: tempstring = tempstring + ") + (1 << LV_" tempstring = tempstring + "),\n#endif\n" - lindex = tempstring.rfind(","); - tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); + tempstring = strip_trailing(tempstring, ",") tempstring = tempstring + "};\n\n" + + counter = 0; + for fcount in fcountlist: + tempstring += "static 
const int " + funclist[counter] + "_n_archs = " + counter += 1; + for arch_list in fcount: + tempstring = tempstring + " (LV_" + for ind in range(len(arch_list)): + tempstring = tempstring + arch_list[ind] + "_CNT"; + if ind < len(arch_list) - 1: + tempstring = tempstring + " * LV_"; + tempstring = tempstring + ") + "; + tempstring = strip_trailing(tempstring, " + "); + tempstring = tempstring + ";\n" tempstring = tempstring + emit_epilog(); diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index e050600f0..0a47ff864 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -18,6 +18,7 @@ union bit128{ struct volk_func_desc { const char **indices; const int *arch_defs; + const int n_archs; }; #endif /*INCLUDED_LIBVOLK_COMMON_H*/ -- cgit From 668da8bd8874ae71a819d55f046e39c964a4270b Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Sat, 16 Apr 2011 10:47:10 -0700 Subject: Volk: manual funcs implemented, QA code runs. Barfs due to missing Orc arch. 
--- volk/include/volk/make_c.py | 20 ++++++++++++++------ volk/include/volk/make_each_machine_c.py | 2 +- volk/include/volk/make_h.py | 7 +++++-- volk/include/volk/volk_register.py | 2 +- 4 files changed, 21 insertions(+), 10 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index 1129285bb..ee9526860 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -36,6 +36,7 @@ def make_c(machines, archs, functions, arched_arglist, my_arglist): #include "volk_rank_archs.h" #include #include +#include """ tempstring += emit_prolog(); @@ -68,11 +69,15 @@ struct volk_machine *get_machine(void) { } } -static unsigned int get_index(const char **indices, char *arch_name) { - - - - +static unsigned int get_index(const char *indices[], unsigned int n_archs, const char *arch_name) { + int i; + for(i=0; i -generic 32|64 mmx sse sse2 +generic orc 32|64 mmx sse sse2 -generic 32|64 mmx sse sse2 sse3 +generic orc 32|64 mmx sse sse2 sse3 -generic 32|64 mmx sse sse2 sse3 ssse3 +generic orc 32|64 mmx sse sse2 sse3 ssse3 -generic 32|64 mmx sse sse2 sse3 sse4_a popcount +generic orc 32|64 mmx sse sse2 sse3 sse4_a popcount -generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 +generic orc 32|64 mmx sse sse2 sse3 ssse3 sse4_1 -generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount +generic orc 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount -generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount avx +generic orc 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount avx -generic altivec +generic orc altivec diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index ee9526860..591e8b64c 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -77,7 +77,8 @@ static unsigned int get_index(const char *indices[], unsigned int n_archs, const } } //something terrible should happen here - return 0; //but we'll fake it for now + printf("Volk warning: no arch found, returning generic impl\n"); + return 
get_index(indices, n_archs, "generic"); //but we'll fake it for now } """ diff --git a/volk/include/volk/make_each_machine_c.py b/volk/include/volk/make_each_machine_c.py index 2b0bf1f48..4efbe9e8c 100644 --- a/volk/include/volk/make_each_machine_c.py +++ b/volk/include/volk/make_each_machine_c.py @@ -36,12 +36,11 @@ def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist): #include """ + tempstring += emit_prolog(); for func in functions: tempstring += "#include \n" tempstring += "\n\n" - tempstring += emit_prolog(); - for i in range(len(functions)): tempstring += "static const " + replace_volk.sub("p", functions[i]) + " " + functions[i] + "_archs[] = {\n" diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index bd6283b58..eaff7f3c8 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -47,46 +47,52 @@ libvolk_la_SOURCES = \ volk_rank_archs.c \ volk_machines.cc + libvolk_la_LDFLAGS = +libvolk_la_LIBADD = -volk_orc_LDFLAGS = \ - $(ORC_LDFLAGS) \ - -lorc-0.4 +if LV_HAVE_ORC +volk_orc_CFLAGS = -DLV_HAVE_ORC=1 +volk_orc_LDFLAGS = $(ORC_LDFLAGS) -lorc-0.4 +volk_orc_LIBADD = ../orc/libvolk_orc.la +else +volk_orc_CFLAGS = +volk_orc_LDFLAGS = +volk_orc_LIBADD = +endif -volk_orc_LIBADD = \ - ../orc/libvolk_orc.la +libvolk_la_CPPFLAGS = $(AM_CPPFLAGS) $(volk_orc_CFLAGS) +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) +libvolk_la_LIBADD = $(volk_orc_LIBADD) + +noinst_LTLIBRARIES = """ #here be dragons - tempstring += "libvolk_la_LIBADD = \n" - tempstring += "libvolk_la_CPPFLAGS = $(AM_CPPFLAGS)\n" - tempstring += "noinst_LTLIBRARIES = \n" for machine_name in machines: tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" tempstring += "libvolk_" + machine_name + "_la_SOURCES = volk_machine_" + machine_name + ".cc\n" tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include $(volk_orc_CFLAGS) " + #tempstring 
+= "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include " for arch in machines[machine_name]: if archflags_dict[arch] != "none": tempstring += "-" + archflags_dict[arch] + " " tempstring += "\nnoinst_LTLIBRARIES += libvolk_" + machine_name + ".la " tempstring += "\nlibvolk_la_LIBADD += libvolk_" + machine_name + ".la\n" - tempstring += "libvolk_la_CPPFLAGS += -DLV_MACHINE_" + machine_name.swapcase() + " " - tempstring += "\nendif\n" + tempstring += "libvolk_la_CPPFLAGS += -DLV_MACHINE_" + machine_name.swapcase() + " \n" + #tempstring += "if LV_HAVE_ORC\n" + #tempstring += "libvolk_" + machine_name + "_la_LIBADD = $(volk_orc_LIBADD)\n" + #tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = $(volk_orc_LDFLAGS)\n" + #tempstring += "else\n" + #tempstring += "libvolk_" + machine_name + "_la_LIBADD = \n" + #tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = \n" + #tempstring += "endif\n" + tempstring += "endif\n" - tempstring += """ -if LV_HAVE_ORC -libvolk_la_LDFLAGS += $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) -libvolk_la_LIBADD += $(volk_orc_LIBADD) -volk_orc_CFLAGS = -DLV_HAVE_ORC -else -libvolk_la_LDFLAGS += $(NO_UNDEFINED) -version-info 0:0:0 -libvolk_la_LIBADD += -volk_orc_CFLAGS = -endif - + tempstring += r""" # ---------------------------------------------------------------- # The QA library. Note libvolk.la in LIBADD @@ -116,14 +122,8 @@ noinst_PROGRAMS = \ testqa_SOURCES = testqa.cc qa_utils.cc testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN $(AM_CPPFLAGS) testqa_LDFLAGS = $(BOOST_UNIT_TEST_FRAMEWORK_LIB) -if LV_HAVE_ORC -testqa_LDADD = \ - libvolk.la \ - ../orc/libvolk_orc.la -else testqa_LDADD = \ libvolk.la -endif distclean-local: rm -f volk.c -- cgit From cd0cc571d90fd2e0b8c9376ce07150db5f405fb7 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Sat, 16 Apr 2011 15:38:57 -0700 Subject: Volk: Compiles and runs with or without Orc installed. 
--- volk/include/volk/machines.xml | 16 ++++++++-------- volk/include/volk/make_each_machine_c.py | 10 +++++----- volk/include/volk/make_makefile_am.py | 7 ------- 3 files changed, 13 insertions(+), 20 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/machines.xml b/volk/include/volk/machines.xml index 9bf838527..ad71da177 100644 --- a/volk/include/volk/machines.xml +++ b/volk/include/volk/machines.xml @@ -15,35 +15,35 @@ --> -generic orc 32|64 mmx sse sse2 +generic 32|64 mmx sse sse2 -generic orc 32|64 mmx sse sse2 sse3 +generic 32|64 mmx sse sse2 sse3 -generic orc 32|64 mmx sse sse2 sse3 ssse3 +generic 32|64 mmx sse sse2 sse3 ssse3 -generic orc 32|64 mmx sse sse2 sse3 sse4_a popcount +generic 32|64 mmx sse sse2 sse3 sse4_a popcount -generic orc 32|64 mmx sse sse2 sse3 ssse3 sse4_1 +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 -generic orc 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount -generic orc 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount avx +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount avx -generic orc altivec +generic altivec diff --git a/volk/include/volk/make_each_machine_c.py b/volk/include/volk/make_each_machine_c.py index 4efbe9e8c..94d6d7789 100644 --- a/volk/include/volk/make_each_machine_c.py +++ b/volk/include/volk/make_each_machine_c.py @@ -46,12 +46,12 @@ def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist): tags_counter = 0 for arch_list in fcountlist[i]: - ok = True + tempstring += "#if " for arch in arch_list: - if arch.swapcase() not in archs: - ok = False - if ok: - tempstring += " " + functions[i] + "_" + str(taglist[i][tags_counter]) + ",\n" + tempstring += "defined(LV_HAVE_" + arch + ") && " + tempstring = strip_trailing(tempstring, " && ") + tempstring += "\n " + functions[i] + "_" + str(taglist[i][tags_counter]) + ",\n" + tempstring += "#endif\n" tags_counter += 1 tempstring = strip_trailing(tempstring, ",") diff --git 
a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index eaff7f3c8..c73730859 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -82,13 +82,6 @@ noinst_LTLIBRARIES = tempstring += "\nnoinst_LTLIBRARIES += libvolk_" + machine_name + ".la " tempstring += "\nlibvolk_la_LIBADD += libvolk_" + machine_name + ".la\n" tempstring += "libvolk_la_CPPFLAGS += -DLV_MACHINE_" + machine_name.swapcase() + " \n" - #tempstring += "if LV_HAVE_ORC\n" - #tempstring += "libvolk_" + machine_name + "_la_LIBADD = $(volk_orc_LIBADD)\n" - #tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = $(volk_orc_LDFLAGS)\n" - #tempstring += "else\n" - #tempstring += "libvolk_" + machine_name + "_la_LIBADD = \n" - #tempstring += "libvolk_" + machine_name + "_la_LDFLAGS = \n" - #tempstring += "endif\n" tempstring += "endif\n" -- cgit From 513020d87912569691601cc2c49ca3331959fa63 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Sat, 16 Apr 2011 23:05:09 -0700 Subject: Volk: modified archs.xml to put Orc higher prio than old SSE/SSE2, since Orc gives better results than those platforms on avg Test cases changed to take ~1s each on my i7 laptop --- volk/include/volk/archs.xml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index 2547df590..977cc7924 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -5,12 +5,6 @@ none - - lorc-0.4 - LV_HAVE_ORC - no - - maltivec @@ -83,6 +77,12 @@ msse2 + + lorc-0.4 + LV_HAVE_ORC + no + + 1 1 -- cgit From a484de90ccf45ca7029497a66d089bd4c781c40f Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Sun, 17 Apr 2011 16:11:11 -0700 Subject: Volk: make_makefile_am.py changes for include dirs --- volk/include/volk/Makefile.am | 1 + volk/include/volk/make_makefile_am.py | 7 +------ 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'volk/include') diff 
--git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 876bb7a60..9a665f59e 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -33,6 +33,7 @@ volkinclude_HEADERS = \ volk_registry.h \ volk.h \ volk_cpu.h \ + volk_machines.h \ volk_environment_init.h \ volk_16i_x5_add_quad_16i_x4_a16.h \ volk_16i_branch_4_state_8_a16.h \ diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index c73730859..1e43634f6 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -47,10 +47,6 @@ libvolk_la_SOURCES = \ volk_rank_archs.c \ volk_machines.cc - -libvolk_la_LDFLAGS = -libvolk_la_LIBADD = - if LV_HAVE_ORC volk_orc_CFLAGS = -DLV_HAVE_ORC=1 volk_orc_LDFLAGS = $(ORC_LDFLAGS) -lorc-0.4 @@ -73,8 +69,7 @@ noinst_LTLIBRARIES = for machine_name in machines: tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" tempstring += "libvolk_" + machine_name + "_la_SOURCES = volk_machine_" + machine_name + ".cc\n" - tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include $(volk_orc_CFLAGS) " - #tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_builddir)/include " + tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_srcdir)/include $(volk_orc_CFLAGS) " for arch in machines[machine_name]: if archflags_dict[arch] != "none": tempstring += "-" + archflags_dict[arch] + " " -- cgit From 208dcc510e6b6beedf2479a598a90d32c19e1274 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Sun, 17 Apr 2011 17:29:45 -0700 Subject: volk: updated lib and include .gitignores for in-tree build --- volk/include/volk/.gitignore | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/.gitignore b/volk/include/volk/.gitignore index be8358f3a..1afe439ba 100644 --- a/volk/include/volk/.gitignore +++ b/volk/include/volk/.gitignore @@ -9,12 +9,9 @@ /Makefile /Makefile.in /volk.h 
-/volk_config.h /volk_config_fixed.h /volk_cpu.h /volk_environment_init.h /volk_registry.h -/volk_runtime.h -/volk_tables.h /volk_typedefs.h -/volk_mktables +/volk_machines.h -- cgit From a56b291edf063604ea40391f40771eda99d8eb40 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Sun, 17 Apr 2011 17:39:15 -0700 Subject: volk: added attributes header (copied from gruel) --- volk/include/volk/Makefile.am | 1 + volk/include/volk/make_h.py | 1 + volk/include/volk/volk_attributes.h | 56 +++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) create mode 100644 volk/include/volk/volk_attributes.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 9a665f59e..a4440a980 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -26,6 +26,7 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ volkincludedir = $(prefix)/include/volk volkinclude_HEADERS = \ + volk_attributes.h \ volk_complex.h \ volk_common.h \ volk_config_fixed.h \ diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py index bf449719d..81928e6b5 100644 --- a/volk/include/volk/make_h.py +++ b/volk/include/volk/make_h.py @@ -12,6 +12,7 @@ def make_h(funclist, arched_arglist) : tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME'; tempstring = tempstring + '\n\n#include\n'; tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; tempstring = tempstring + emit_prolog(); diff --git a/volk/include/volk/volk_attributes.h b/volk/include/volk/volk_attributes.h new file mode 100644 index 000000000..5345c1535 --- /dev/null +++ b/volk/include/volk/volk_attributes.h @@ -0,0 +1,56 @@ +/* + * Copyright 2011 Free Software Foundation, Inc. 
+ * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_VOLK_ATTRIBUTES_H +#define INCLUDED_VOLK_ATTRIBUTES_H + +//////////////////////////////////////////////////////////////////////// +// Cross-platform attribute macros +//////////////////////////////////////////////////////////////////////// +#if defined __GNUC__ +# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) +# define __VOLK_ATTR_UNUSED __attribute__((unused)) +# define __VOLK_ATTR_INLINE __attribute__((always_inline)) +# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) +# if __GNUC__ >= 4 +# define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) +# define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) +# else +# define __VOLK_ATTR_EXPORT +# define __VOLK_ATTR_IMPORT +# endif +#elif _MSC_VER +# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) +# define __VOLK_ATTR_UNUSED +# define __VOLK_ATTR_INLINE __forceinline +# define __VOLK_ATTR_DEPRECATED __declspec(deprecated) +# define __VOLK_ATTR_EXPORT __declspec(dllexport) +# define __VOLK_ATTR_IMPORT __declspec(dllimport) +#else +# define __VOLK_ATTR_ALIGNED(x) +# define __VOLK_ATTR_UNUSED +# define __VOLK_ATTR_INLINE +# define __VOLK_ATTR_DEPRECATED +# define __VOLK_ATTR_EXPORT +# define 
__VOLK_ATTR_IMPORT +#endif + +#endif /* INCLUDED_VOLK_ATTRIBUTES_H */ -- cgit From 0b3e4f25eea1eef3e8a45fdb6d1bcc1ec57d1321 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Sun, 17 Apr 2011 23:03:18 -0700 Subject: volk: replace references to __attribute__((aligned... with cross platform macro This replaces all aligned(16) and aligned(128) (which were incorrect). Attribute macros are on the left because this makes gcc and ms compilers happy. --- volk/include/volk/volk_16ic_magnitude_16i_a16.h | 9 +++++---- volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h | 3 ++- volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h | 3 ++- volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h | 5 +++-- volk/include/volk/volk_32f_accumulator_s32f_a16.h | 3 ++- volk/include/volk/volk_32f_index_max_16u_a16.h | 9 +++++---- .../volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h | 5 +++-- volk/include/volk/volk_32f_s32f_convert_16i_a16.h | 3 ++- volk/include/volk/volk_32f_s32f_convert_16i_u.h | 2 +- volk/include/volk/volk_32f_s32f_convert_32i_a16.h | 3 ++- volk/include/volk/volk_32f_s32f_convert_32i_u.h | 2 +- volk/include/volk/volk_32f_s32f_convert_8i_a16.h | 3 ++- volk/include/volk/volk_32f_s32f_convert_8i_u.h | 2 +- volk/include/volk/volk_32f_s32f_stddev_32f_a16.h | 5 +++-- volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h | 9 +++++---- volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h | 7 ++++--- volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 6 +++--- volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h | 3 ++- volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h | 3 ++- volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h | 5 +++-- volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h | 5 +++-- volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 2 +- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 3 ++- volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h | 3 ++- volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h 
| 3 ++- 25 files changed, 63 insertions(+), 43 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a16.h b/volk/include/volk/volk_16ic_magnitude_16i_a16.h index e75d54ec4..00d29b112 100644 --- a/volk/include/volk/volk_16ic_magnitude_16i_a16.h +++ b/volk/include/volk/volk_16ic_magnitude_16i_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H #define INCLUDED_volk_16ic_magnitude_16i_a16_H +#include #include #include #include @@ -25,8 +26,8 @@ static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, co __m128 cplxValue1, cplxValue2, result; - float inputFloatBuffer[8] __attribute__((aligned(128))); - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ @@ -96,8 +97,8 @@ static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, con __m128 cplxValue1, cplxValue2, iValue, qValue, result; - float inputFloatBuffer[4] __attribute__((aligned(128))); - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h index dcb2499fa..a4f0689e5 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H +#include #include #include @@ -25,7 +26,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, fl __m128 invScalar = _mm_set_ps1(1.0/scalar); int16_t* complexVectorPtr = (int16_t*)complexVector; - float floatBuffer[8] 
__attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; for(;number < quarterPoints; number++){ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h index f21fe77f8..564aa1f5d 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H +#include #include #include @@ -72,7 +73,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, __m128 invScalar = _mm_set_ps1(iScalar); int16_t* complexVectorPtr = (int16_t*)complexVector; - float floatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; for(;number < quarterPoints; number++){ floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h index 388d2ebcd..637ba9fd0 100644 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H #define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H +#include #include #include #include @@ -25,7 +26,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, __m128 cplxValue1, cplxValue2, result; - float inputFloatBuffer[8] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; for(;number < quarterPoints; number++){ @@ -91,7 +92,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, __m128 cplxValue1, cplxValue2, result, re, im; - float inputFloatBuffer[8] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; for(;number < quarterPoints; 
number++){ inputFloatBuffer[0] = (float)(complexVectorPtr[0]); diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a16.h b/volk/include/volk/volk_32f_accumulator_s32f_a16.h index 6a85e066e..94aff3a49 100644 --- a/volk/include/volk/volk_32f_accumulator_s32f_a16.h +++ b/volk/include/volk/volk_32f_accumulator_s32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H #define INCLUDED_volk_32f_accumulator_s32f_a16_H +#include #include #include @@ -18,7 +19,7 @@ static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* const unsigned int quarterPoints = num_points / 4; const float* aPtr = inputBuffer; - float tempBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; __m128 accumulator = _mm_setzero_ps(); __m128 aVal = _mm_setzero_ps(); diff --git a/volk/include/volk/volk_32f_index_max_16u_a16.h b/volk/include/volk/volk_32f_index_max_16u_a16.h index 3934d2db7..5c19bfca0 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a16.h +++ b/volk/include/volk/volk_32f_index_max_16u_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_index_max_16u_a16_H #define INCLUDED_volk_32f_index_max_16u_a16_H +#include #include #include #include @@ -25,8 +26,8 @@ static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const __m128 compareResults; __m128 currentValues; - float maxValuesBuffer[4] __attribute__((aligned(16))); - float maxIndexesBuffer[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; for(;number < quarterPoints; number++){ @@ -83,8 +84,8 @@ static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const fl __m128 compareResults; __m128 currentValues; - float maxValuesBuffer[4] __attribute__((aligned(16))); - float maxIndexesBuffer[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; for(;number < 
quarterPoints; number++){ diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h index 55d4e0319..70ab3ccdb 100644 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H +#include #include #include @@ -21,7 +22,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* no const unsigned int quarterPoints = num_points / 4; const float* dataPointsPtr = realDataPoints; - float avgPointsVector[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; __m128 dataPointsVal; __m128 avgPointsVal = _mm_setzero_ps(); @@ -87,7 +88,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* no sumMean += avgPointsVector[3]; // Calculate the number of valid bins from the remaning count - float validBinCountVector[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4]; _mm_store_ps(validBinCountVector, vValidBinCount); float validBinCount = 0; diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h index 9d1d0ef4d..71b53ba3a 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H #define INCLUDED_volk_32f_s32f_convert_16i_a16_H +#include #include #include @@ -63,7 +64,7 @@ static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, cons __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ ret 
= _mm_load_ps(inputVectorPtr); diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h index 06228ef7d..dec3f1611 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -65,7 +65,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ ret = _mm_loadu_ps(inputVectorPtr); diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h index 82c74bf44..095d7bd35 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H #define INCLUDED_volk_32f_s32f_convert_32i_a16_H +#include #include #include @@ -59,7 +60,7 @@ static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, cons __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ ret = _mm_load_ps(inputVectorPtr); diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h index 253a48ae3..b4e954dc4 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -61,7 +61,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ ret = _mm_loadu_ps(inputVectorPtr); diff --git 
a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h index 8dab0cdf4..509a46609 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H #define INCLUDED_volk_32f_s32f_convert_8i_a16_H +#include #include #include @@ -70,7 +71,7 @@ static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ ret = _mm_load_ps(inputVectorPtr); diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h index 72b193c9d..1c6bf87c9 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h @@ -72,7 +72,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; - float outputFloatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; for(;number < quarterPoints; number++){ ret = _mm_loadu_ps(inputVectorPtr); diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h index 48d2fe1fe..779ae2d39 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H #define INCLUDED_volk_32f_s32f_stddev_32f_a16_H +#include #include #include #include @@ -22,7 +23,7 @@ static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const floa const float* aPtr = inputBuffer; - float squareBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; __m128 squareAccumulator = _mm_setzero_ps(); 
__m128 aVal1, aVal2, aVal3, aVal4; @@ -82,7 +83,7 @@ static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* const float* aPtr = inputBuffer; - float squareBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; __m128 squareAccumulator = _mm_setzero_ps(); __m128 aVal = _mm_setzero_ps(); diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h index f1cb2ae0e..9605322d3 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H +#include #include #include #include @@ -22,8 +23,8 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, flo const unsigned int sixteenthPoints = num_points / 16; const float* aPtr = inputBuffer; - float meanBuffer[4] __attribute__((aligned(128))); - float squareBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; __m128 accumulator = _mm_setzero_ps(); __m128 squareAccumulator = _mm_setzero_ps(); @@ -95,8 +96,8 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* const unsigned int quarterPoints = num_points / 4; const float* aPtr = inputBuffer; - float meanBuffer[4] __attribute__((aligned(128))); - float squareBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; __m128 accumulator = _mm_setzero_ps(); __m128 squareAccumulator = _mm_setzero_ps(); diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h index d13f12e51..93151260f 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h @@ 
-1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H #define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H +#include #include @@ -53,7 +54,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float bPtr += 4; } - float dotProductVector[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector @@ -102,7 +103,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float bPtr += 4; } - float dotProductVector[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector @@ -163,7 +164,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const flo dotProdVal = _mm_add_ps(dotProdVal, cVal1); } - float dotProductVector[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index 7c1136a67..7f47122ff 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -53,7 +53,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* bPtr += 4; } - float dotProductVector[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector @@ -102,7 +102,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * bPtr += 4; } - float dotProductVector[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float 
dotProductVector[4]; dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector @@ -163,7 +163,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float dotProdVal = _mm_add_ps(dotProdVal, cVal1); } - float dotProductVector[4] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h index e98735245..cab3db50d 100644 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H +#include #include #include @@ -85,7 +86,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVe int16_t* complexVectorPtr = (int16_t*)complexVector; - float floatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; for(;number < quarterPoints; number++){ iValue = _mm_load_ps(iBufferPtr); diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h index 1e3e61e08..304515a5c 100644 --- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h +++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H +#include #include #include @@ -24,7 +25,7 @@ static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer __m128 cplxValue1, cplxValue2, iValue; - float floatBuffer[4] __attribute__((aligned(128))); + 
__VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; for(;number < quarterPoints; number++){ cplxValue1 = _mm_load_ps(complexVectorPtr); diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h index 14318ab01..96afa5ae9 100644 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H #define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H +#include #include #include #include @@ -25,7 +26,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVecto __m128 cplxValue1, cplxValue2, result; - float floatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; for(;number < quarterPoints; number++){ cplxValue1 = _mm_load_ps(complexVectorPtr); @@ -80,7 +81,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector __m128 cplxValue1, cplxValue2, iValue, qValue, result; - float floatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; for(;number < quarterPoints; number++){ cplxValue1 = _mm_load_ps(complexVectorPtr); diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h index d78faf5b5..78e28c903 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H +#include #include #include @@ -64,7 +65,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* r static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - static const uint32_t conjugator[4] 
__attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; @@ -205,7 +206,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* resul #if LV_HAVE_SSE && LV_HAVE_32 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; int bound = num_bytes >> 4; int leftovers = num_bytes % 16; diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h index 69781f0fb..73576a766 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -66,7 +66,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* res static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; union HalfMask { uint32_t intRep[4]; diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h index b7b9768ab..d404ee684 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H +#include #include 
#include #include @@ -358,7 +359,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const b += 2; } - lv_32fc_t dotProductVector[2] __attribute__((aligned(16))); + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h index 80032d2fe..d68d2462a 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H +#include #include #include @@ -95,7 +96,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, flo __m128 invScalar = _mm_set_ps1(1.0/scalar); int8_t* complexVectorPtr = (int8_t*)complexVector; - float floatBuffer[8] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; for(;number < quarterPoints; number++){ floatBuffer[0] = (float)(complexVectorPtr[0]); diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h index 47a968ac1..d2cfa42f6 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H +#include #include #include @@ -81,7 +82,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, c __m128 invScalar = _mm_set_ps1(iScalar); int8_t* complexVectorPtr = (int8_t*)complexVector; - float floatBuffer[4] __attribute__((aligned(128))); + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; for(;number < quarterPoints; number++){ floatBuffer[0] = 
(float)(*complexVectorPtr); complexVectorPtr += 2; -- cgit From b31f891208f8a1c3605f4db2297cc6b6f4b37aca Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Sun, 17 Apr 2011 19:37:44 -0700 Subject: volk: added VOLK_API macro to external symbols --- volk/include/volk/make_h.py | 14 +++++++++++--- volk/include/volk/make_makefile_am.py | 2 ++ 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py index 81928e6b5..07e62939b 100644 --- a/volk/include/volk/make_h.py +++ b/volk/include/volk/make_h.py @@ -2,7 +2,14 @@ from xml.dom import minidom from emit_omnilog import * from volk_regexp import * - +# http://gcc.gnu.org/wiki/Visibility +volk_api_defines = """ +#ifdef volk_EXPORTS +# define VOLK_API __VOLK_ATTR_EXPORT +#else +# define VOLK_API __VOLK_ATTR_IMPORT +#endif +""" def make_h(funclist, arched_arglist) : tempstring = ""; @@ -14,15 +21,16 @@ def make_h(funclist, arched_arglist) : tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; tempstring = tempstring + '#include\n'; + tempstring = tempstring + volk_api_defines tempstring = tempstring + emit_prolog(); tempstring = tempstring + '\n'; for i in range(len(funclist)): tempstring += "extern " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + ";\n" - tempstring += "extern void %s_manual%s;\n" % (funclist[i], arched_arglist[i]) + tempstring += "extern VOLK_API void %s_manual%s;\n" % (funclist[i], arched_arglist[i]) tempstring = strip_trailing(tempstring, " {") - tempstring += "extern struct volk_func_desc %s_get_func_desc(void);\n" % (funclist[i]) + tempstring += "extern VOLK_API struct volk_func_desc %s_get_func_desc(void);\n" % (funclist[i]) tempstring = tempstring + emit_epilog(); tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index 1e43634f6..af4467f37 100644 --- 
a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -26,6 +26,8 @@ include $(top_srcdir)/Makefile.common AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ -I$(top_builddir)/include \ + -Dvolk_EXPORTS \ + -fvisibility=hidden \ $(WITH_INCLUDES) lib_LTLIBRARIES = \ -- cgit From de771b0303298e7761f3c5350323565d05c5ee6d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Mon, 18 Apr 2011 10:11:57 -0700 Subject: Volk: remove some dead files from Makefile.am --- volk/include/volk/make_makefile_am.py | 1 - 1 file changed, 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py index af4467f37..2ff492b34 100644 --- a/volk/include/volk/make_makefile_am.py +++ b/volk/include/volk/make_makefile_am.py @@ -35,7 +35,6 @@ lib_LTLIBRARIES = \ EXTRA_DIST = \ volk_rank_archs.h \ - volk_proccpu_sim.c \ gcc_x86_cpuid.h # ---------------------------------------------------------------- -- cgit From d941ba31677804fe382d76fca17fc044d12777f5 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Tue, 26 Apr 2011 21:41:34 -0700 Subject: volk: removed cppunit from the build (not used anymore) --- volk/include/volk/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index a4440a980..b36d96c5d 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -20,7 +20,7 @@ include $(top_srcdir)/Makefile.common -AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ $(LV_CXXFLAGS) $(WITH_INCLUDES) volkincludedir = $(prefix)/include/volk -- cgit From a5e2d9e5baf869ae961fbb5820447290d6d9c7c8 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Tue, 26 Apr 2011 21:55:48 -0700 Subject: volk: reorganization of generation sources and generated files All generation sources have been moved to the gen/ subdirectory. 
Bootstrap and volk_register.py generate the files into the gen/ subdirectory in an effort to cleanly separate the static/generated parts of the build tree. Define top_gendir in Makefile.common, all generated sources listed in Makefile.ams are prefixed with $(top_gendir) to differentiate them from static in-tree sources. --- volk/include/volk/.gitignore | 15 -- volk/include/volk/Makefile.am | 28 +-- volk/include/volk/archs.xml | 137 ------------- volk/include/volk/emit_omnilog.py | 13 -- volk/include/volk/machines.xml | 49 ----- volk/include/volk/make_c.py | 100 --------- volk/include/volk/make_config_fixed.py | 21 -- volk/include/volk/make_config_in.py | 13 -- volk/include/volk/make_cpuid_c.py | 184 ----------------- volk/include/volk/make_cpuid_h.py | 48 ----- volk/include/volk/make_each_machine_c.py | 82 -------- volk/include/volk/make_environment_init_c.py | 32 --- volk/include/volk/make_environment_init_h.py | 18 -- volk/include/volk/make_h.py | 39 ---- volk/include/volk/make_machines_c.py | 57 ------ volk/include/volk/make_machines_h.py | 50 ----- volk/include/volk/make_makefile_am.py | 131 ------------ volk/include/volk/make_proccpu_sim.py | 47 ----- volk/include/volk/make_registry.py | 85 -------- volk/include/volk/make_set_simd.py | 166 --------------- volk/include/volk/make_typedefs.py | 23 --- volk/include/volk/volk_regexp.py | 14 -- volk/include/volk/volk_register.py | 291 --------------------------- 23 files changed, 7 insertions(+), 1636 deletions(-) delete mode 100644 volk/include/volk/archs.xml delete mode 100644 volk/include/volk/emit_omnilog.py delete mode 100644 volk/include/volk/machines.xml delete mode 100644 volk/include/volk/make_c.py delete mode 100644 volk/include/volk/make_config_fixed.py delete mode 100644 volk/include/volk/make_config_in.py delete mode 100644 volk/include/volk/make_cpuid_c.py delete mode 100644 volk/include/volk/make_cpuid_h.py delete mode 100644 volk/include/volk/make_each_machine_c.py delete mode 100644
volk/include/volk/make_environment_init_c.py delete mode 100644 volk/include/volk/make_environment_init_h.py delete mode 100644 volk/include/volk/make_h.py delete mode 100644 volk/include/volk/make_machines_c.py delete mode 100644 volk/include/volk/make_machines_h.py delete mode 100644 volk/include/volk/make_makefile_am.py delete mode 100644 volk/include/volk/make_proccpu_sim.py delete mode 100644 volk/include/volk/make_registry.py delete mode 100644 volk/include/volk/make_set_simd.py delete mode 100644 volk/include/volk/make_typedefs.py delete mode 100644 volk/include/volk/volk_regexp.py delete mode 100755 volk/include/volk/volk_register.py (limited to 'volk/include') diff --git a/volk/include/volk/.gitignore b/volk/include/volk/.gitignore index 1afe439ba..b336cc7ce 100644 --- a/volk/include/volk/.gitignore +++ b/volk/include/volk/.gitignore @@ -1,17 +1,2 @@ -/*.cache -/*.la -/*.lo -/*.pc -/.deps -/.la -/.libs -/.lo /Makefile /Makefile.in -/volk.h -/volk_config_fixed.h -/volk_cpu.h -/volk_environment_init.h -/volk_registry.h -/volk_typedefs.h -/volk_machines.h diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index b36d96c5d..ea60d201a 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -29,13 +29,13 @@ volkinclude_HEADERS = \ volk_attributes.h \ volk_complex.h \ volk_common.h \ - volk_config_fixed.h \ - volk_typedefs.h \ - volk_registry.h \ - volk.h \ - volk_cpu.h \ - volk_machines.h \ - volk_environment_init.h \ + $(top_gendir)/include/volk/volk_config_fixed.h \ + $(top_gendir)/include/volk/volk_typedefs.h \ + $(top_gendir)/include/volk/volk_registry.h \ + $(top_gendir)/include/volk/volk.h \ + $(top_gendir)/include/volk/volk_cpu.h \ + $(top_gendir)/include/volk/volk_machines.h \ + $(top_gendir)/include/volk/volk_environment_init.h \ volk_16i_x5_add_quad_16i_x4_a16.h \ volk_16i_branch_4_state_8_a16.h \ volk_16ic_deinterleave_16i_x2_a16.h \ @@ -124,17 +124,3 @@ volkinclude_HEADERS = \ volk_8i_convert_16i_u.h \ 
volk_8i_s32f_convert_32f_a16.h \ volk_8i_s32f_convert_32f_u.h - -distclean-local: - rm -f volk_config_fixed.h - rm -f volk_cpu.h - rm -f volk.h - rm -f volk_registry.h - rm -f volk_runtime.h - rm -f volk_typedefs.h - rm -f volk_tables.h - rm -f *.pyc - rm -f Makefile.in - rm -f volk_environment_init.h - rm -f volk_mktables - rm -f $(BUILT_SOURCES) diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml deleted file mode 100644 index 977cc7924..000000000 --- a/volk/include/volk/archs.xml +++ /dev/null @@ -1,137 +0,0 @@ - - - - - none - - - - maltivec - - - - m32 - MD_SUBCPU - x86_64 - - - - 0x80000001 - d - 29 - m64 - 1 - MD_SUBCPU - x86 - - - - 0x80000001 - d - 31 - m3dnow - 1 - - - - 1 - 0x80000001 - d - 5 - sse4.2 - - - - 1 - 1 - c - 23 - mpopcnt - - - - 1 - 1 - d - 23 - mmmx - - - - - 1 - 1 - d - 25 - msse - _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); - xmmintrin.h - - - - - 1 - 1 - d - 26 - msse2 - - - - lorc-0.4 - LV_HAVE_ORC - no - - - - 1 - 1 - c - 0 - msse3 - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); - pmmintrin.h - - - - 1 - 1 - c - 9 - mssse3 - - - - 1 - 0x80000001 - c - 6 - msse4a - - - - - 1 - 1 - c - 19 - msse4.1 - - - - 1 - 1 - c - 20 - msse4.2 - - - - 1 - 1 - c - 28 - mavx - - - diff --git a/volk/include/volk/emit_omnilog.py b/volk/include/volk/emit_omnilog.py deleted file mode 100644 index 309d7e578..000000000 --- a/volk/include/volk/emit_omnilog.py +++ /dev/null @@ -1,13 +0,0 @@ -def emit_prolog(): - tempstring = ""; - tempstring = tempstring + '#ifdef __cplusplus\n'; - tempstring = tempstring + 'extern "C" {\n'; - tempstring = tempstring + '#endif\n'; - return tempstring; -def emit_epilog(): - tempstring = ""; - tempstring = tempstring + '#ifdef __cplusplus\n'; - tempstring = tempstring + '}\n'; - tempstring = tempstring + '#endif\n'; - return tempstring; - diff --git a/volk/include/volk/machines.xml b/volk/include/volk/machines.xml deleted file mode 100644 index ad71da177..000000000 --- a/volk/include/volk/machines.xml 
+++ /dev/null @@ -1,49 +0,0 @@ - - - -generic - - - - - -generic 32|64 mmx sse sse2 - - - -generic 32|64 mmx sse sse2 sse3 - - - -generic 32|64 mmx sse sse2 sse3 ssse3 - - - -generic 32|64 mmx sse sse2 sse3 sse4_a popcount - - - -generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 - - - -generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount - - - -generic 32|64 mmx sse sse2 sse3 ssse3 sse4_2 popcount avx - - - -generic altivec - - - diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py deleted file mode 100644 index 591e8b64c..000000000 --- a/volk/include/volk/make_c.py +++ /dev/null @@ -1,100 +0,0 @@ -# -# Copyright 2010 Free Software Foundation, Inc. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -from volk_regexp import * -import string -from emit_omnilog import * - -#ok todo list: -#put n_archs into the info struct so it doesn't have to be arch_defs[0]. - -def make_c(machines, archs, functions, arched_arglist, my_arglist): - tempstring = r""" -// This file is automatically generated by make_c.py. -// Do not edit this file. -""" - tempstring += """ -#include -#include -#include -#include -#include -#include "volk_rank_archs.h" -#include -#include -#include - -""" - tempstring += emit_prolog(); - -#OK here's the deal. the .h prototypes the functions. the .c impls them as fptrs, can use p_whatever. 
-#also .c impls the get_machine call -#also .c impls the default call for each fn - -#here do static fn get arch - tempstring += r""" -struct volk_machine *get_machine(void) { - extern struct volk_machine volk_machines[]; - extern unsigned int n_volk_machines; - static struct volk_machine *machine = NULL; - - if(machine != NULL) return machine; - else { - unsigned int max_score = 0; - int i; - for(i=0; i max_score) { - max_score = volk_machines[i].caps; - machine = &(volk_machines[i]); - } - } - } - printf("Using Volk machine: %s\n", machine->name); - return machine; - } -} - -static unsigned int get_index(const char *indices[], unsigned int n_archs, const char *arch_name) { - int i; - for(i=0; i -#include - -struct VOLK_CPU volk_cpu; - -#if defined(__i386__) || (__x86_64__) -#include -#define cpuid_x86(op, r) __get_cpuid(op, r+0, r+1, r+2, r+3) - -static inline unsigned int cpuid_eax(unsigned int op) { - unsigned int regs[4]; - cpuid_x86 (op, regs); - return regs[0]; -} - -static inline unsigned int cpuid_ebx(unsigned int op) { - unsigned int regs[4]; - cpuid_x86 (op, regs); - return regs[1]; -} - -static inline unsigned int cpuid_ecx(unsigned int op) { - unsigned int regs[4]; - cpuid_x86 (op, regs); - return regs[2]; -} - -static inline unsigned int cpuid_edx(unsigned int op) { - unsigned int regs[4]; - cpuid_x86 (op, regs); - return regs[3]; -} -#endif - -""" - -def make_cpuid_c(dom) : - tempstring = HEADER_TEMPL; - - for domarch in dom: - if str(domarch.attributes["type"].value) == "x86": - if "no_test" in domarch.attributes.keys(): - no_test = str(domarch.attributes["no_test"].value); - if no_test == "true": - no_test = True; - else: - no_test = False; - else: - no_test = False; - arch = str(domarch.attributes["name"].value); - op = domarch.getElementsByTagName("op"); - if op: - op = str(op[0].firstChild.data); - reg = domarch.getElementsByTagName("reg"); - if reg: - reg = str(reg[0].firstChild.data); - shift = domarch.getElementsByTagName("shift"); - if 
shift: - shift = str(shift[0].firstChild.data); - val = domarch.getElementsByTagName("val"); - if val: - val = str(val[0].firstChild.data); - - if no_test: - tempstring = tempstring + """\ -int i_can_has_%s () { -#if defined(__i386__) || (__x86_64__) - return 1; -#else - return 0; -#endif -} - -""" % (arch) - - elif op == "1": - tempstring = tempstring + """\ -int i_can_has_%s () { -#if defined(__i386__) || (__x86_64__) - unsigned int e%sx = cpuid_e%sx (%s); - return ((e%sx >> %s) & 1) == %s; -#else - return 0; -#endif -} - -""" % (arch, reg, reg, op, reg, shift, val) - - elif op == "0x80000001": - tempstring = tempstring + """\ -int i_can_has_%s () { -#if defined(__i386__) || (__x86_64__) - unsigned int extended_fct_count = cpuid_eax(0x80000000); - if (extended_fct_count < 0x80000001) - return %s^1; - unsigned int extended_features = cpuid_e%sx (%s); - return ((extended_features >> %s) & 1) == %s; -#else - return 0; -#endif -} - -""" % (arch, val, reg, op, shift, val) - - elif str(domarch.attributes["type"].value) == "powerpc": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + """\ -int i_can_has_%s () { -#ifdef __PPC__ - return 1; -#else - return 0; -#endif -} - -""" % (arch) - - elif str(domarch.attributes["type"].value) == "all": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + """\ -int i_can_has_%s () { - return 1; -} - -""" % (arch) - else: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + """\ -int i_can_has_%s () { - return 0; -} - -""" % (arch) - - tempstring = tempstring + "void volk_cpu_init() {\n"; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" - tempstring = tempstring + "}\n\n" - - tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; - tempstring = tempstring + " unsigned int retval = 0;\n" - tempstring = tempstring + " volk_cpu_init();\n" - for domarch 
in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" - tempstring = tempstring + " return retval;\n" - tempstring = tempstring + "}\n\n" - - return tempstring; - - - - - - - diff --git a/volk/include/volk/make_cpuid_h.py b/volk/include/volk/make_cpuid_h.py deleted file mode 100644 index cd3da2455..000000000 --- a/volk/include/volk/make_cpuid_h.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2011 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Radio; see the file COPYING. If not, write to -# the Free Software Foundation, Inc., 51 Franklin Street, -# Boston, MA 02110-1301, USA. 
-# - -from xml.dom import minidom -from emit_omnilog import * - -def make_cpuid_h(dom) : - tempstring = ""; - tempstring = tempstring +'/*this file is auto generated by volk_register.py*/'; - tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_CPU_H'; - tempstring = tempstring +'\n#define INCLUDED_VOLK_CPU_H\n\n'; - tempstring = tempstring + emit_prolog(); - tempstring = tempstring + '\n' - - tempstring = tempstring + "struct VOLK_CPU {\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " int (*has_" + arch + ") ();\n"; - tempstring = tempstring + "};\n\n"; - tempstring = tempstring + "extern struct VOLK_CPU volk_cpu;\n\n"; - - tempstring = tempstring + "void volk_cpu_init ();\n" - tempstring = tempstring + "unsigned int volk_get_lvarch ();\n" - - tempstring = tempstring + "\n"; - tempstring = tempstring + emit_epilog(); - tempstring = tempstring + "#endif /*INCLUDED_VOLK_CPU_H*/\n" - - return tempstring; diff --git a/volk/include/volk/make_each_machine_c.py b/volk/include/volk/make_each_machine_c.py deleted file mode 100644 index 94d6d7789..000000000 --- a/volk/include/volk/make_each_machine_c.py +++ /dev/null @@ -1,82 +0,0 @@ -# -# Copyright 2010 Free Software Foundation, Inc. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
-# - -from volk_regexp import * -import string -from emit_omnilog import * - -#ok todo list: -#put n_archs into the info struct so it doesn't have to be arch_defs[0]. - -def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist): - tempstring = r""" -// This file is automatically generated by make_each_machine_c.py. -// Do not edit this file. -""" - for arch in archs: - tempstring += "#define LV_HAVE_" + arch.swapcase() + " 1\n" - - tempstring += """ -#include -#include -#include - -""" - tempstring += emit_prolog(); - for func in functions: - tempstring += "#include \n" - tempstring += "\n\n" - - for i in range(len(functions)): - tempstring += "static const " + replace_volk.sub("p", functions[i]) + " " + functions[i] + "_archs[] = {\n" - - tags_counter = 0 - for arch_list in fcountlist[i]: - tempstring += "#if " - for arch in arch_list: - tempstring += "defined(LV_HAVE_" + arch + ") && " - tempstring = strip_trailing(tempstring, " && ") - tempstring += "\n " + functions[i] + "_" + str(taglist[i][tags_counter]) + ",\n" - tempstring += "#endif\n" - tags_counter += 1 - - tempstring = strip_trailing(tempstring, ",") - tempstring += "};\n\n" - - - tempstring += "static unsigned int caps = " - for arch in archs: - tempstring += "(1 << LV_" + arch.swapcase() + ") + " - tempstring = strip_trailing(tempstring, " + ") - tempstring += ";\n" - - tempstring += "static const char* name = \"" + machine_name + "\";\n" - tempstring += "struct volk_machine volk_machine_" + machine_name + " = {\n" - tempstring += " caps,\n" - tempstring += " name,\n" - - for i in range(len(functions)): - tempstring += " { " + functions[i] + "_indices, " + functions[i] + "_arch_defs, " + functions[i] + "_n_archs },\n" - tempstring += " " + functions[i] + "_archs,\n" - - tempstring = strip_trailing(tempstring, ",") - tempstring += "};\n" - tempstring += emit_epilog(); - - return tempstring - - diff --git a/volk/include/volk/make_environment_init_c.py 
b/volk/include/volk/make_environment_init_c.py deleted file mode 100644 index 263d5bcd1..000000000 --- a/volk/include/volk/make_environment_init_c.py +++ /dev/null @@ -1,32 +0,0 @@ -from xml.dom import minidom - -def make_environment_init_c(dom) : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring + "#include\n" - for domarch in dom: - arch = str(domarch.attributes["name"].value); - incs = domarch.getElementsByTagName("include"); - for inc in incs: - my_inc = str(inc.firstChild.data); - tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; - tempstring = tempstring + "#include<" + my_inc + ">\n"; - tempstring = tempstring + "#endif\n" - tempstring = tempstring + '\n\n'; - tempstring = tempstring + "void volk_environment_init(){\n" - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - envs = domarch.getElementsByTagName("environment"); - for env in envs: - cmd = str(env.firstChild.data); - tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; - tempstring = tempstring + " " + cmd + "\n"; - tempstring = tempstring + "#endif\n" - - tempstring = tempstring + "}\n"; - return tempstring; - - - - diff --git a/volk/include/volk/make_environment_init_h.py b/volk/include/volk/make_environment_init_h.py deleted file mode 100644 index 77a841a24..000000000 --- a/volk/include/volk/make_environment_init_h.py +++ /dev/null @@ -1,18 +0,0 @@ -from xml.dom import minidom -from emit_omnilog import * - -def make_environment_init_h() : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring + "#ifndef INCLUDE_LIBVECTOR_ENVIRONMENT_INIT_H\n"; - tempstring = tempstring + "#define INCLUDE_LIBVECTOR_ENVIRONMENT_INIT_H\n"; - tempstring = tempstring + "\n"; - tempstring = tempstring + emit_prolog(); - tempstring = tempstring + "void volk_environment_init();\n"; - tempstring = tempstring 
+ emit_epilog(); - tempstring = tempstring + "#endif\n" - return tempstring; - - - - diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py deleted file mode 100644 index 07e62939b..000000000 --- a/volk/include/volk/make_h.py +++ /dev/null @@ -1,39 +0,0 @@ -from xml.dom import minidom -from emit_omnilog import * -from volk_regexp import * - -# http://gcc.gnu.org/wiki/Visibility -volk_api_defines = """ -#ifdef volk_EXPORTS -# define VOLK_API __VOLK_ATTR_EXPORT -#else -# define VOLK_API __VOLK_ATTR_IMPORT -#endif -""" - -def make_h(funclist, arched_arglist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by make_h.py*/\n'; - - tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_RUNTIME'; - tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME'; - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + '#include\n'; - tempstring = tempstring + volk_api_defines - tempstring = tempstring + emit_prolog(); - - tempstring = tempstring + '\n'; - - for i in range(len(funclist)): - tempstring += "extern " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + ";\n" - tempstring += "extern VOLK_API void %s_manual%s;\n" % (funclist[i], arched_arglist[i]) - tempstring = strip_trailing(tempstring, " {") - tempstring += "extern VOLK_API struct volk_func_desc %s_get_func_desc(void);\n" % (funclist[i]) - - tempstring = tempstring + emit_epilog(); - tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; - - return tempstring; - diff --git a/volk/include/volk/make_machines_c.py b/volk/include/volk/make_machines_c.py deleted file mode 100644 index 9ad56fb62..000000000 --- a/volk/include/volk/make_machines_c.py +++ /dev/null @@ -1,57 +0,0 @@ -# -# Copyright 2010 Free Software Foundation, Inc. 
-# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -from volk_regexp import * - -def make_machines_c(machines): - tempstring = r""" -// This file is automatically generated by make_machines_c.py. -// Do not edit this file. - -#include -#include -#include - -struct volk_machine volk_machines[] = { -""" - for machine in machines: - tempstring += """#if LV_MACHINE_""" + machine.swapcase() + "\n" - tempstring += "volk_machine_" + machine - tempstring += "," - tempstring += "\n#endif\n" - - tempstring += r""" -}; - -""" - - for machine in machines: - tempstring += "#if LV_MACHINE_" + machine.swapcase() + "\n" - tempstring += "#define LV_MACHINE_" + machine.swapcase() + "_CNT 1\n" - tempstring += "#else\n" - tempstring += "#define LV_MACHINE_" + machine.swapcase() + "_CNT 0\n" - tempstring += "#endif\n" - - tempstring += """unsigned int n_volk_machines = -""" - for machine in machines: - tempstring += "(LV_MACHINE_" + machine.swapcase() + "_CNT) " - tempstring += "+ " - tempstring = tempstring[:-2] - tempstring += ";\n" - - return tempstring diff --git a/volk/include/volk/make_machines_h.py b/volk/include/volk/make_machines_h.py deleted file mode 100644 index 674ee12cd..000000000 --- a/volk/include/volk/make_machines_h.py +++ /dev/null @@ -1,50 +0,0 @@ -# -# Copyright 2010 Free Software Foundation, Inc. 
-# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -from volk_regexp import * - -def make_machines_h(functions, machines): - tempstring = r""" -// This file is automatically generated by make_machines_h.py. -// Do not edit this file. - -#ifndef INCLUDED_LIBVOLK_MACHINES_H -#define INCLUDED_LIBVOLK_MACHINES_H - -#include -#include - -struct volk_machine { - const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format) - const char *name; -""" - for function in functions: - tempstring += "\n const struct volk_func_desc " + function + "_desc;\n" - tempstring += " const " + replace_volk.sub("p", function) + " *" + function + "_archs;\n" - - tempstring += r"""}; - -""" - for machine in machines: - tempstring += """#if LV_MACHINE_""" + machine.swapcase() + "\n" - tempstring += "extern const struct volk_machine volk_machine_" + machine + ";\n" - tempstring += """#endif\n""" - - tempstring += r""" -#endif //INCLUDED_LIBVOLK_MACHINES_H""" - - return tempstring diff --git a/volk/include/volk/make_makefile_am.py b/volk/include/volk/make_makefile_am.py deleted file mode 100644 index 2ff492b34..000000000 --- a/volk/include/volk/make_makefile_am.py +++ /dev/null @@ -1,131 +0,0 @@ -# -# Copyright 2010 Free Software Foundation, Inc. 
-# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -from xml.dom import minidom - -def make_makefile_am(dom, machines, archflags_dict): - tempstring = r""" -# This file is automatically generated by make_makefile_am.py. -# Do not edit this file. - -include $(top_srcdir)/Makefile.common - -AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ - -I$(top_builddir)/include \ - -Dvolk_EXPORTS \ - -fvisibility=hidden \ - $(WITH_INCLUDES) - -lib_LTLIBRARIES = \ - libvolk.la - -EXTRA_DIST = \ - volk_rank_archs.h \ - gcc_x86_cpuid.h - -# ---------------------------------------------------------------- -# The main library -# ---------------------------------------------------------------- - -libvolk_la_SOURCES = \ - $(platform_CODE) \ - volk.cc \ - volk_cpu.c \ - volk_rank_archs.c \ - volk_machines.cc - -if LV_HAVE_ORC -volk_orc_CFLAGS = -DLV_HAVE_ORC=1 -volk_orc_LDFLAGS = $(ORC_LDFLAGS) -lorc-0.4 -volk_orc_LIBADD = ../orc/libvolk_orc.la -else -volk_orc_CFLAGS = -volk_orc_LDFLAGS = -volk_orc_LIBADD = -endif - -libvolk_la_CPPFLAGS = $(AM_CPPFLAGS) $(volk_orc_CFLAGS) -libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) -libvolk_la_LIBADD = $(volk_orc_LIBADD) - -noinst_LTLIBRARIES = - -""" - - #here be dragons - for machine_name in machines: - tempstring += "if LV_MACHINE_" + machine_name.swapcase() + "\n" - tempstring += "libvolk_" + machine_name + "_la_SOURCES = volk_machine_" + machine_name + ".cc\n" 
- tempstring += "libvolk_" + machine_name + "_la_CPPFLAGS = -I$(top_srcdir)/include $(volk_orc_CFLAGS) " - for arch in machines[machine_name]: - if archflags_dict[arch] != "none": - tempstring += "-" + archflags_dict[arch] + " " - - tempstring += "\nnoinst_LTLIBRARIES += libvolk_" + machine_name + ".la " - tempstring += "\nlibvolk_la_LIBADD += libvolk_" + machine_name + ".la\n" - tempstring += "libvolk_la_CPPFLAGS += -DLV_MACHINE_" + machine_name.swapcase() + " \n" - tempstring += "endif\n" - - - tempstring += r""" - -# ---------------------------------------------------------------- -# The QA library. Note libvolk.la in LIBADD -# ---------------------------------------------------------------- -#libvolk_qa_la_SOURCES = \ -# qa_utils.cc - -#libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lboost - -#libvolk_qa_la_LIBADD = \ -# libvolk.la \ -# libvolk_runtime.la - -# ---------------------------------------------------------------- -# headers that don't get installed -# ---------------------------------------------------------------- -noinst_HEADERS = \ - volk_init.h \ - qa_utils.h - -# ---------------------------------------------------------------- -# Our test program -# ---------------------------------------------------------------- -noinst_PROGRAMS = \ - testqa - -testqa_SOURCES = testqa.cc qa_utils.cc -testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN $(AM_CPPFLAGS) -testqa_LDFLAGS = $(BOOST_UNIT_TEST_FRAMEWORK_LIB) -testqa_LDADD = \ - libvolk.la - -distclean-local: - rm -f volk.c - rm -f volk_cpu_generic.c - rm -f volk_cpu_powerpc.c - rm -f volk_cpu_x86.c - rm -f volk_init.c - rm -f volk_init.h - rm -f volk_mktables.c - rm -f volk_proccpu_sim.c - rm -f volk_tables.h - rm -f volk_environment_init.c -""" - - - return tempstring diff --git a/volk/include/volk/make_proccpu_sim.py b/volk/include/volk/make_proccpu_sim.py deleted file mode 100644 index 029dacfcc..000000000 --- a/volk/include/volk/make_proccpu_sim.py +++ /dev/null @@ -1,47 +0,0 @@ 
-#!/usr/bin/env python -# -# Copyright 2011 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Radio; see the file COPYING. If not, write to -# the Free Software Foundation, Inc., 51 Franklin Street, -# Boston, MA 02110-1301, USA. -# - -from xml.dom import minidom - -def make_proccpu_sim(dom) : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n" - tempstring = tempstring + "\n\n" - - tempstring = tempstring + "void test_append(char* buf, int val, char* newkey){\n"; - tempstring = tempstring + " if(val==1){\n"; - tempstring = tempstring + " sprintf(buf, \"%s %s\", buf, newkey);\n"; - tempstring = tempstring + " }\n"; - tempstring = tempstring + "}\n"; - tempstring = tempstring + "\n\n"; - - tempstring = tempstring + "int main() {\n"; - tempstring = tempstring + " volk_cpu_init();\n"; - tempstring = tempstring + " char buf[2048];\n"; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " test_append(buf, volk_cpu.has_" + arch + "(), \"" + arch + "\");\n" - tempstring = tempstring + " printf(\"%s\\n\", buf);\n" - tempstring = tempstring + "}\n" - return tempstring; diff --git a/volk/include/volk/make_registry.py b/volk/include/volk/make_registry.py deleted file mode 100644 index de1f46aa6..000000000 --- 
a/volk/include/volk/make_registry.py +++ /dev/null @@ -1,85 +0,0 @@ -from xml.dom import minidom -from emit_omnilog import * -from volk_regexp import * -import string - -def make_registry(dom, funclist, fcountlist, taglist) : - tempstring = ""; - tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; - tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_REGISTRY_H'; - tempstring = tempstring +'\n#define INCLUDED_VOLK_REGISTRY_H\n\n'; - tempstring = tempstring +'#include\n'; - tempstring = tempstring + emit_prolog(); - tempstring = tempstring + '\n' - - - - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring +"#ifdef LV_HAVE_" + arch.swapcase() + "\n"; - tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 1\n"; - tempstring = tempstring +"#else\n"; - tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 0\n"; - tempstring = tempstring +"#endif /*LV_HAVE_" + arch.swapcase() + "*/\n\n"; - - counter = 0; - - for i in range(len(funclist)): - tempstring = tempstring + "static const char* " + funclist[i] + "_indices[] = {\n"; - - tags_counter = 0; - for arch_list in fcountlist[i]: - tempstring = tempstring + "#if defined(LV_HAVE_" - for ind in range(len(arch_list)): - - tempstring = tempstring + arch_list[ind] + ")"; - if ind < len(arch_list) - 1: - tempstring = tempstring + " && defined(LV_HAVE_"; - - tempstring = tempstring + "\n \"" + str(taglist[i][tags_counter]) + "\",\n#endif\n"; - tags_counter = tags_counter + 1; - - tempstring = strip_trailing(tempstring, ",") - tempstring = tempstring + "};\n\n"; - - - for fcount in fcountlist: - tempstring = tempstring + "static const int " + funclist[counter] + "_arch_defs[] = {\n"; - counter += 1; - for arch_list in fcount: - tempstring = tempstring + "#if defined(LV_HAVE_" - for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind] + ")"; - if ind < len(arch_list) - 1: - tempstring = tempstring + " && 
defined(LV_HAVE_"; - tempstring = tempstring + "\n" - tempstring = tempstring + " (1 << LV_" - for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind]; - if ind < len(arch_list) - 1: - tempstring = tempstring + ") + (1 << LV_" - tempstring = tempstring + "),\n#endif\n" - tempstring = strip_trailing(tempstring, ",") - tempstring = tempstring + "};\n\n" - - counter = 0; - for fcount in fcountlist: - tempstring += "static const int " + funclist[counter] + "_n_archs = " - counter += 1; - for arch_list in fcount: - tempstring = tempstring + " (LV_" - for ind in range(len(arch_list)): - tempstring = tempstring + arch_list[ind] + "_CNT"; - if ind < len(arch_list) - 1: - tempstring = tempstring + " * LV_"; - tempstring = tempstring + ") + "; - tempstring = strip_trailing(tempstring, " + "); - tempstring = tempstring + ";\n" - - - tempstring = tempstring + emit_epilog(); - tempstring = tempstring +"#endif /*INCLUDED_VOLK_REGISTRY_H*/\n"; - - return tempstring; - diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py deleted file mode 100644 index 5a848e59e..000000000 --- a/volk/include/volk/make_set_simd.py +++ /dev/null @@ -1,166 +0,0 @@ -# -# Copyright 2010 Free Software Foundation, Inc. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
-# - -from xml.dom import minidom - -def make_set_simd(dom, machines) : - tempstring = ""; - tempstring = tempstring +'dnl this file is auto generated by volk_register.py\n\n'; - - tempstring = tempstring +'\ndnl define arch checks\n'; - for domarch in dom: - if str(domarch.attributes["type"].value) != "all": - arch = str(domarch.attributes["name"].value); - flag = domarch.getElementsByTagName("flag"); - flag = str(flag[0].firstChild.data); - tempstring = tempstring + "AC_DEFUN([_TRY_ADD_" + arch.swapcase() + "],\n"; - tempstring = tempstring + "[\n"; - tempstring = tempstring + " LF_CHECK_CC_FLAG([-" + flag + "])\n"; - tempstring = tempstring + " LF_CHECK_CXX_FLAG([-" + flag + "])\n"; - tempstring = tempstring + "])\n"; - - tempstring = tempstring +'\ndnl main set_simd_flags\n'; - tempstring = tempstring + "AC_DEFUN([LV_SET_SIMD_FLAGS],\n"; - tempstring = tempstring + "[\n"; - #tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n"; - tempstring = tempstring + " AC_SUBST(LV_CXXFLAGS)\n"; - tempstring = tempstring + " indCC=no\n"; - tempstring = tempstring + " indCXX=no\n"; - tempstring = tempstring + " indLV_ARCH=no\n"; - tempstring = tempstring + " AC_ARG_WITH(lv_arch,\n"; - tempstring = tempstring + " AC_HELP_STRING([--with-lv_arch=ARCH],[set volk hardware speedups as space separated string with elements from the following list("; - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + arch + ", " - tempstring = tempstring[0:len(tempstring) - 2]; - - tempstring = tempstring + ")]),\n"; - tempstring = tempstring + " [cf_with_lv_arch=\"$withval\"],\n"; - tempstring = tempstring + " [cf_with_lv_arch=\"\"])\n"; - if str(domarch.attributes["type"].value) == "all": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " AC_DEFINE(LV_MAKE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; - tempstring = tempstring + " ADDONS=\"\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"\"\n"; - 
#tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; - tempstring = tempstring + " OVERRULE_FLAG=\"no\"\n"; - tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; - tempstring = tempstring + " cf_with_lv_arch=\""; - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + arch + " "; - tempstring = tempstring[0:-1] + "\"\n"; - tempstring = tempstring + " OVERRULE_FLAG=\"yes\"\n"; - tempstring = tempstring + " fi\n"; - - tempstring = tempstring +'\ndnl init LV_MAKE_XXX and then try to add archs\n'; - for domarch in dom: - if str(domarch.attributes["type"].value) != "all": - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=no\n"; - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - atype = str(domarch.attributes["type"].value); - if atype != "all": - tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n"; - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - atype = str(domarch.attributes["type"].value); - tempstring = tempstring +'\ndnl add in flags for arch ' + arch + '\n'; - overrule = domarch.getElementsByTagName("overrule"); - if overrule: - overrule = str(overrule[0].firstChild.data); - else: - overrule = ""; - overrule_val = domarch.getElementsByTagName("overrule_val"); - if overrule_val: - overrule_val = str(overrule_val[0].firstChild.data); - else: - overrule_val = ""; - flag = domarch.getElementsByTagName("flag"); - flag = str(flag[0].firstChild.data); - if atype != "all": - tempstring = tempstring + " for i in $lf_CXXFLAGS\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; - tempstring = tempstring + " indCXX=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " for i in $lf_CFLAGS\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X-" + flag 
+"; then\n"; - tempstring = tempstring + " indCC=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " indLV_ARCH=no\n" - tempstring = tempstring + " fi\n" - - tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - - #tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; - tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " indCC=no\n" - tempstring = tempstring + " indCXX=no\n" - tempstring = tempstring + " indLV_ARCH=no\n" - else: - tempstring = tempstring + " for i in $cf_with_lv_arch\n" - tempstring = tempstring + " do\n" - tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; - tempstring = tempstring + " indLV_ARCH=yes\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " done\n" - tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " indLV_ARCH=no\n" - tempstring = tempstring + " fi\n" - tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" - tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " 
BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " indLV_ARCH=no\n" - - - for domarch in dom: - arch = str(domarch.attributes["name"].value); - tempstring = tempstring + " AM_CONDITIONAL(LV_MAKE_" + arch.swapcase() + ", test \"$LV_MAKE_" + arch.swapcase() + "\" == \"yes\")\n"; - - tempstring += "\n" - #now we can define the machines we're compiling - for machine_name in machines: - tempstring += " AM_CONDITIONAL(LV_MACHINE_" + machine_name.swapcase() + ", " - marchlist = machines[machine_name] - for march in marchlist: - tempstring += "test \"$LV_MAKE_" + march.swapcase() + "\" == \"yes\" && " - - tempstring += "test true)\n" #just so we don't have to detect the last one in the group, i know - tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" - tempstring = tempstring + "])\n" - - return tempstring; - - diff --git a/volk/include/volk/make_typedefs.py b/volk/include/volk/make_typedefs.py deleted file mode 100644 index 8f9f2b55e..000000000 --- a/volk/include/volk/make_typedefs.py +++ /dev/null @@ -1,23 +0,0 @@ -from xml.dom import minidom -import string -from volk_regexp import * - - - -def make_typedefs(funclist, retlist, my_argtypelist) : - tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_TYPEDEFS'; - tempstring = tempstring + '\n#define INCLUDED_VOLK_TYPEDEFS\n'; - tempstring = tempstring + '\n\n#include\n'; - tempstring = tempstring + '#include\n'; - - tempstring = tempstring + '\n'; - - for i in range(len(funclist)): - tempstring = tempstring + "typedef " + retlist[i] +" (*" + replace_volk.sub("p", funclist[i]) + ")(" + my_argtypelist[i] + ");\n"; - - tempstring = tempstring + "#endif /*INCLUDED_VOLK_TYPEDEFS*/\n"; - - return tempstring; diff --git a/volk/include/volk/volk_regexp.py 
b/volk/include/volk/volk_regexp.py deleted file mode 100644 index b83ce5206..000000000 --- a/volk/include/volk/volk_regexp.py +++ /dev/null @@ -1,14 +0,0 @@ -import re -import string - -remove_after_underscore = re.compile("_.*"); -space_remove = re.compile(" "); -leading_space_remove = re.compile("^ *"); -replace_arch = re.compile(", const char\* arch"); -replace_bracket = re.compile(" {"); -replace_volk = re.compile("volk"); - -def strip_trailing(tostrip, stripstr): - lindex = tostrip.rfind(stripstr) - tostrip = tostrip[0:lindex] + string.replace(tostrip[lindex:len(tostrip)], stripstr, ""); - return tostrip diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py deleted file mode 100755 index 9b7ca73f9..000000000 --- a/volk/include/volk/volk_register.py +++ /dev/null @@ -1,291 +0,0 @@ -#! /usr/bin/env python - -import sys -import re -import string -from xml.dom import minidom -from volk_regexp import * -from make_cpuid_c import make_cpuid_c -from make_cpuid_h import make_cpuid_h -from make_set_simd import make_set_simd -from make_registry import make_registry -from make_config_fixed import make_config_fixed -from make_typedefs import make_typedefs -from make_environment_init_c import make_environment_init_c -from make_environment_init_h import make_environment_init_h -from make_makefile_am import make_makefile_am -from make_machines_h import make_machines_h -from make_machines_c import make_machines_c -from make_each_machine_c import make_each_machine_c -from make_c import make_c -from make_h import make_h -import copy - -outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); -outfile_reg = open("volk_registry.h", "w"); -outfile_h = open("volk.h", "w"); -outfile_c = open("../../lib/volk.cc", "w"); -outfile_typedefs = open("volk_typedefs.h", "w"); -outfile_init_h = open("../../lib/volk_init.h", "w"); -outfile_cpu_h = open("volk_cpu.h", "w"); -outfile_cpu_c = open("../../lib/volk_cpu.c", "w"); -#outfile_config_in = 
open("../../volk_config.h.in", "w"); -outfile_config_fixed = open("volk_config_fixed.h", "w"); -#outfile_mktables = open("../../lib/volk_mktables.c", "w"); -outfile_environment_c = open("../../lib/volk_environment_init.c", "w"); -outfile_environment_h = open("volk_environment_init.h", "w"); -outfile_makefile_am = open("../../lib/Makefile.am", "w"); -outfile_machines_h = open("volk_machines.h", "w"); -outfile_machines_c = open("../../lib/volk_machines.cc", "w"); -infile = open("Makefile.am", "r"); - - -mfile = infile.readlines(); - -datatypes = []; -functions = []; - - - -for line in mfile: - subline = re.search(".*_(a16|u)\.h.*", line); - if subline: - subsubline = re.search("(?<=volk_).*", subline.group(0)); - if subsubline: - dtype = remove_after_underscore.sub("", subsubline.group(0)); - subdtype = re.search("[0-9]+[A-z]+", dtype); - if subdtype: - datatypes.append(subdtype.group(0)); - - -datatypes = set(datatypes); - -for line in mfile: - for dt in datatypes: - if dt in line: - subline = re.search("(volk_" + dt +"_.*(a16|u).*\.h)", line); - if subline: - - subsubline = re.search(".+(?=\.h)", subline.group(0)); - functions.append(subsubline.group(0)); - -archs = []; -afile = minidom.parse("archs.xml"); -filearchs = afile.getElementsByTagName("arch"); -for filearch in filearchs: - archs.append(str(filearch.attributes["name"].value)); - -for arch in archs: - a_var = re.search("^\$", arch); - if a_var: - archs.remove(arch); - - - -archflags_dict = {} -for filearch in filearchs: - archflags_dict[str(filearch.attributes["name"].value)] = str(filearch.getElementsByTagName("flag")[0].firstChild.data) - -archs_or = "(" -for arch in archs: - archs_or = archs_or + string.upper(arch) + "|"; -archs_or = archs_or[0:len(archs_or)-1]; -archs_or = archs_or + ")"; - -#get machine list and parse to a list of machines, each with a list of archs (none of this DOM crap) -machine_str_dict = {} -mfile = minidom.parse("machines.xml"); -filemachines = 
mfile.getElementsByTagName("machine") - -for filemachine in filemachines: - machine_str_dict[str(filemachine.attributes["name"].value)] = str(filemachine.getElementsByTagName("archs")[0].firstChild.data).split() - -#all right now you have a dict of arch lists -#next we expand it -#this is an expanded list accounting for the OR syntax -#TODO: make this work for multiple "|" machines -machines = {} -already_done = False -for machine_name in machine_str_dict: - already_done = False - marchlist = machine_str_dict[machine_name] - for march in marchlist: - or_marchs = march.split("|") - if len(or_marchs) > 1: - marchlist.remove(march) - for or_march in or_marchs: - tempmarchlist = copy.deepcopy(marchlist) - tempmarchlist.append(or_march) - machines[machine_name + "_" + or_march] = tempmarchlist - already_done = True - - if not already_done: - machines[machine_name] = marchlist - -#for machine_name in machines: -# print machine_name + ": " + str(machines[machine_name]) - -#ok, now we have all the machines we're going to build. next step is to generate a Makefile.am where they're all laid out and compiled - -taglist = []; -fcountlist = []; -arched_arglist = []; -retlist = []; -my_arglist = []; -my_argtypelist = []; -for func in functions: - tags = []; - fcount = []; - infile_source = open(func + ".h"); - begun_name = 0; - begun_paren = 0; - sourcefile = infile_source.readlines(); - infile_source.close(); - for line in sourcefile: -#FIXME: make it work for multiple #if define()s - archline = re.search("^\#if.*?LV_HAVE_" + archs_or + ".*", line); - if archline: - arch = archline.group(0); - archline = re.findall(archs_or + "(?=( |\n|&))", line); - if archline: - archsublist = []; - for tup in archline: - archsublist.append(tup[0]); - fcount.append(archsublist); - testline = re.search("static inline.*?" 
+ func, line); - if (not testline): - continue - tagline = re.search(func + "_.+", line); - if tagline: - tag = re.search("(?<=" + func + "_)\w+(?= *\()",line); - if tag: - tag = re.search("\w+", tag.group(0)); - if tag: - tags.append(tag.group(0)); - - - if begun_name == 0: - retline = re.search(".+(?=" + func + ")", line); - if retline: - ret = retline.group(0); - - - - - subline = re.search(func + ".*", line); - if subline: - subsubline = re.search("\(.*?\)", subline.group(0)); - if subsubline: - args = subsubline.group(0); - - else: - begun_name = 1; - subsubline = re.search("\(.*", subline.group(0)); - if subsubline: - args = subsubline.group(0); - begun_paren = 1; - else: - if begun_paren == 1: - subline = re.search(".*?\)", line); - if subline: - args = args + subline.group(0); - begun_name = 0; - begun_paren = 0; - else: - subline = re.search(".*", line); - args = args + subline.group(0); - else: - subline = re.search("\(.*?\)", line); - if subline: - args = subline.group(0); - begun_name = 0; - else: - subline = re.search("\(.*", line); - if subline: - args = subline.group(0); - begun_paren = 1; - - replace = re.compile("static "); - ret = replace.sub("", ret); - replace = re.compile("inline "); - ret = replace.sub("", ret); - replace = re.compile("\)"); - arched_args = replace.sub(", const char* arch) {", args); - - remove = re.compile('\)|\(|{'); - rargs = remove.sub("", args); - sargs = rargs.split(','); - - - - margs = []; - atypes = []; - for arg in sargs: - temp = arg.split(" "); - margs.append(temp[-1]); - replace = re.compile(" " + temp[-1]); - atypes.append(replace.sub("", arg)); - - - my_args = "" - arg_types = "" - for arg in range(0, len(margs) - 1): - this_arg = leading_space_remove.sub("", margs[arg]); - my_args = my_args + this_arg + ", "; - this_type = leading_space_remove.sub("", atypes[arg]); - arg_types = arg_types + this_type + ", "; - - this_arg = leading_space_remove.sub("", margs[-1]); - my_args = my_args + this_arg; - this_type = 
leading_space_remove.sub("", atypes[-1]); - arg_types = arg_types + this_type; - my_argtypelist.append(arg_types); - - if(ret[-1] != ' '): - ret = ret + ' '; - - arched_arglist.append(arched_args); #!!!!!!!!!!! - my_arglist.append(my_args) #!!!!!!!!!!!!!!!!! - retlist.append(ret); - fcountlist.append(fcount); - taglist.append(tags); - - -outfile_cpu_h.write(make_cpuid_h(filearchs)); -outfile_cpu_h.close(); - -outfile_cpu_c.write(make_cpuid_c(filearchs)); -outfile_cpu_c.close(); - -outfile_set_simd.write(make_set_simd(filearchs, machines)); -outfile_set_simd.close(); - -outfile_reg.write(make_registry(filearchs, functions, fcountlist, taglist)); -outfile_reg.close(); - -outfile_config_fixed.write(make_config_fixed(filearchs)); -outfile_config_fixed.close(); - -outfile_typedefs.write(make_typedefs(functions, retlist, my_argtypelist)); -outfile_typedefs.close(); - -outfile_makefile_am.write(make_makefile_am(filearchs, machines, archflags_dict)) -outfile_makefile_am.close() - -outfile_machines_h.write(make_machines_h(functions, machines)) -outfile_machines_h.close() - -outfile_machines_c.write(make_machines_c(machines)) -outfile_machines_c.close() - -outfile_c.write(make_c(machines, archs, functions, arched_arglist, my_arglist)) -outfile_c.close() - -outfile_h.write(make_h(functions, arched_arglist)) -outfile_h.close() - -for machine in machines: - machine_c_filename = "../../lib/volk_machine_" + machine + ".cc" - outfile_machine_c = open(machine_c_filename, "w") - outfile_machine_c.write(make_each_machine_c(machine, machines[machine], functions, fcountlist, taglist)) - outfile_machine_c.close() -- cgit From 12413747c90754482582e16c95b551e1b36c6074 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Fri, 6 May 2011 11:25:00 -0700 Subject: volk: removed volk_registry.h, it was superseded by the machines --- volk/include/volk/Makefile.am | 1 - 1 file changed, 1 deletion(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am 
index ea60d201a..bb4645526 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -31,7 +31,6 @@ volkinclude_HEADERS = \ volk_common.h \ $(top_gendir)/include/volk/volk_config_fixed.h \ $(top_gendir)/include/volk/volk_typedefs.h \ - $(top_gendir)/include/volk/volk_registry.h \ $(top_gendir)/include/volk/volk.h \ $(top_gendir)/include/volk/volk_cpu.h \ $(top_gendir)/include/volk/volk_machines.h \ -- cgit From 5b4c7d27e9d49ab58df1f1d9350dcaf64c60a1ce Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Fri, 6 May 2011 12:41:16 -0700 Subject: volk: top-level common header cleanup Since we already have a volk_common.h, moved the attributes, API declaration, and c-linkage macros into volk_common.h This change removes volk_attributes.h, in favor of one common include header. The implementation headers that require attributes now include volk_common.h This change removes the emit_omnilog.py script, in favor of pre-processor macros in volk_common.h In addition, extern C is only defined when in C++ and in GCC because non-GCC does not have complex.h --- volk/include/volk/Makefile.am | 1 - volk/include/volk/volk_16ic_magnitude_16i_a16.h | 2 +- .../volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h | 2 +- .../volk_16ic_s32f_deinterleave_real_32f_a16.h | 2 +- .../volk/volk_16ic_s32f_magnitude_32f_a16.h | 2 +- volk/include/volk/volk_32f_accumulator_s32f_a16.h | 2 +- volk/include/volk/volk_32f_index_max_16u_a16.h | 2 +- ...lk_32f_s32f_calc_spectral_noise_floor_32f_a16.h | 2 +- volk/include/volk/volk_32f_s32f_convert_16i_a16.h | 2 +- volk/include/volk/volk_32f_s32f_convert_32i_a16.h | 2 +- volk/include/volk/volk_32f_s32f_convert_8i_a16.h | 2 +- volk/include/volk/volk_32f_s32f_stddev_32f_a16.h | 2 +- .../volk/volk_32f_stddev_and_mean_32f_x2_a16.h | 2 +- volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h | 2 +- .../volk/volk_32f_x2_s32f_interleave_16ic_a16.h | 2 +- .../volk_32fc_s32f_deinterleave_real_16i_a16.h | 2 +- .../volk/volk_32fc_s32f_magnitude_16i_a16.h | 2 +- 
.../volk_32fc_x2_conjugate_dot_prod_32fc_a16.h | 2 +- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 2 +- .../volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h | 2 +- .../volk/volk_8ic_s32f_deinterleave_real_32f_a16.h | 2 +- volk/include/volk/volk_attributes.h | 56 ------------------- volk/include/volk/volk_common.h | 62 +++++++++++++++++++--- 23 files changed, 76 insertions(+), 83 deletions(-) delete mode 100644 volk/include/volk/volk_attributes.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index bb4645526..a05f0fbfd 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -26,7 +26,6 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ volkincludedir = $(prefix)/include/volk volkinclude_HEADERS = \ - volk_attributes.h \ volk_complex.h \ volk_common.h \ $(top_gendir)/include/volk/volk_config_fixed.h \ diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a16.h b/volk/include/volk/volk_16ic_magnitude_16i_a16.h index 00d29b112..73c6f3390 100644 --- a/volk/include/volk/volk_16ic_magnitude_16i_a16.h +++ b/volk/include/volk/volk_16ic_magnitude_16i_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H #define INCLUDED_volk_16ic_magnitude_16i_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h index a4f0689e5..e4a9015b4 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h index 564aa1f5d..993445995 100644 --- 
a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h index 637ba9fd0..a136c0535 100644 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H #define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a16.h b/volk/include/volk/volk_32f_accumulator_s32f_a16.h index 94aff3a49..dd24a1e29 100644 --- a/volk/include/volk/volk_32f_accumulator_s32f_a16.h +++ b/volk/include/volk/volk_32f_accumulator_s32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H #define INCLUDED_volk_32f_accumulator_s32f_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32f_index_max_16u_a16.h b/volk/include/volk/volk_32f_index_max_16u_a16.h index 5c19bfca0..af1f35348 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a16.h +++ b/volk/include/volk/volk_32f_index_max_16u_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_index_max_16u_a16_H #define INCLUDED_volk_32f_index_max_16u_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h index 70ab3ccdb..f5b388e6d 100644 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H #define 
INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h index 71b53ba3a..4acd2e13e 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H #define INCLUDED_volk_32f_s32f_convert_16i_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h index 095d7bd35..2927d616c 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H #define INCLUDED_volk_32f_s32f_convert_32i_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h index 509a46609..c114ea38f 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H #define INCLUDED_volk_32f_s32f_convert_8i_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h index 779ae2d39..c2b903657 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H #define INCLUDED_volk_32f_s32f_stddev_32f_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h index 9605322d3..10d72e09d 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h +++ 
b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h index 93151260f..2cd974070 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H #define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H -#include +#include #include diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h index cab3db50d..f7ad3fd18 100644 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h index 304515a5c..2460039d2 100644 --- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h +++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h index 96afa5ae9..f67ab0607 100644 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H #define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H -#include +#include #include #include 
#include diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h index 78e28c903..f221237ff 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h index d404ee684..9657c8f6b 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H -#include +#include #include #include #include diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h index d68d2462a..7d778796e 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h index d2cfa42f6..a2e0cd8de 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h @@ -1,7 +1,7 @@ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H -#include +#include #include #include diff --git a/volk/include/volk/volk_attributes.h b/volk/include/volk/volk_attributes.h deleted file mode 100644 index 5345c1535..000000000 --- 
a/volk/include/volk/volk_attributes.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2011 Free Software Foundation, Inc. - * - * This file is part of GNU Radio - * - * GNU Radio is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * GNU Radio is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Radio; see the file COPYING. If not, write to - * the Free Software Foundation, Inc., 51 Franklin Street, - * Boston, MA 02110-1301, USA. - */ - -#ifndef INCLUDED_VOLK_ATTRIBUTES_H -#define INCLUDED_VOLK_ATTRIBUTES_H - -//////////////////////////////////////////////////////////////////////// -// Cross-platform attribute macros -//////////////////////////////////////////////////////////////////////// -#if defined __GNUC__ -# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) -# define __VOLK_ATTR_UNUSED __attribute__((unused)) -# define __VOLK_ATTR_INLINE __attribute__((always_inline)) -# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) -# if __GNUC__ >= 4 -# define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) -# define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) -# else -# define __VOLK_ATTR_EXPORT -# define __VOLK_ATTR_IMPORT -# endif -#elif _MSC_VER -# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) -# define __VOLK_ATTR_UNUSED -# define __VOLK_ATTR_INLINE __forceinline -# define __VOLK_ATTR_DEPRECATED __declspec(deprecated) -# define __VOLK_ATTR_EXPORT __declspec(dllexport) -# define __VOLK_ATTR_IMPORT __declspec(dllimport) -#else -# define __VOLK_ATTR_ALIGNED(x) -# define 
__VOLK_ATTR_UNUSED -# define __VOLK_ATTR_INLINE -# define __VOLK_ATTR_DEPRECATED -# define __VOLK_ATTR_EXPORT -# define __VOLK_ATTR_IMPORT -#endif - -#endif /* INCLUDED_VOLK_ATTRIBUTES_H */ diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index 0a47ff864..1e868561e 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -1,6 +1,62 @@ #ifndef INCLUDED_LIBVOLK_COMMON_H #define INCLUDED_LIBVOLK_COMMON_H +//////////////////////////////////////////////////////////////////////// +// Cross-platform attribute macros +//////////////////////////////////////////////////////////////////////// +#if defined __GNUC__ +# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) +# define __VOLK_ATTR_UNUSED __attribute__((unused)) +# define __VOLK_ATTR_INLINE __attribute__((always_inline)) +# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) +# if __GNUC__ >= 4 +# define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) +# define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) +# else +# define __VOLK_ATTR_EXPORT +# define __VOLK_ATTR_IMPORT +# endif +#elif _MSC_VER +# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) +# define __VOLK_ATTR_UNUSED +# define __VOLK_ATTR_INLINE __forceinline +# define __VOLK_ATTR_DEPRECATED __declspec(deprecated) +# define __VOLK_ATTR_EXPORT __declspec(dllexport) +# define __VOLK_ATTR_IMPORT __declspec(dllimport) +#else +# define __VOLK_ATTR_ALIGNED(x) +# define __VOLK_ATTR_UNUSED +# define __VOLK_ATTR_INLINE +# define __VOLK_ATTR_DEPRECATED +# define __VOLK_ATTR_EXPORT +# define __VOLK_ATTR_IMPORT +#endif + +//////////////////////////////////////////////////////////////////////// +// C-linkage declaration macros +// FIXME: due to the usage of complex.h, require gcc for c-linkage +//////////////////////////////////////////////////////////////////////// +#if defined(__cplusplus) && (__GNUC__) +# define __VOLK_DECL_BEGIN extern "C" { +# define __VOLK_DECL_END } +#else 
+# define __VOLK_DECL_BEGIN +# define __VOLK_DECL_END +#endif + +//////////////////////////////////////////////////////////////////////// +// Define VOLK_API for library symbols +// http://gcc.gnu.org/wiki/Visibility +//////////////////////////////////////////////////////////////////////// +#ifdef volk_EXPORTS +# define VOLK_API __VOLK_ATTR_EXPORT +#else +# define VOLK_API __VOLK_ATTR_IMPORT +#endif + +//////////////////////////////////////////////////////////////////////// +// The bit128 union used by some +//////////////////////////////////////////////////////////////////////// #include #ifdef LV_HAVE_MMX #include @@ -15,10 +71,4 @@ union bit128{ }; #endif /*LV_HAVE_MMX*/ -struct volk_func_desc { - const char **indices; - const int *arch_defs; - const int n_archs; -}; - #endif /*INCLUDED_LIBVOLK_COMMON_H*/ -- cgit From c40ef84defaeed0c9ec70e45a7e4019fa6d6e1b2 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Fri, 6 May 2011 14:27:48 -0700 Subject: volk: various backports from MSVC building 1) Added support for __cpuid intrinsic under MSVC 2) Fixed disambiguation for std::abs overload in qa code 3) Fixed bit128 union, the ifdefs were completely wrong --- volk/include/volk/volk_common.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index 1e868561e..12623073c 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -57,18 +57,30 @@ //////////////////////////////////////////////////////////////////////// // The bit128 union used by some //////////////////////////////////////////////////////////////////////// -#include -#ifdef LV_HAVE_MMX -#include +#include + +#ifdef LV_HAVE_SSE +#include +#endif + +#ifdef LV_HAVE_SSE2 +#include +#endif + union bit128{ uint16_t i16[8]; uint32_t i[4]; float f[4]; double d[2]; - __m128i int_vec; + + #ifdef LV_HAVE_SSE __m128 float_vec; + #endif + + #ifdef LV_HAVE_SSE2 + 
__m128i int_vec; __m128d double_vec; + #endif }; -#endif /*LV_HAVE_MMX*/ #endif /*INCLUDED_LIBVOLK_COMMON_H*/ -- cgit From ffbe7a0e951808ee972054b2182b97986595a9c5 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Mon, 9 May 2011 20:22:46 -0700 Subject: volk: implement type-agnostic operators for volk_complex --- volk/include/volk/volk_32fc_s32f_power_32fc_a16.h | 18 ++--- .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 2 +- .../volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h | 8 +-- .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h | 8 +-- volk/include/volk/volk_complex.h | 79 +++++++++++++--------- 5 files changed, 66 insertions(+), 49 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h index 90cc4f5e7..155b93ca2 100644 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h @@ -3,6 +3,14 @@ #include #include +#include + +//! raise a complex float to a real float power +static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a16(const lv_32fc_t exp, const float power){ + const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp)); + const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2); + return mag*lv_cmake(cosf(arg), sinf(arg)); +} #ifdef LV_HAVE_SSE #include @@ -72,11 +80,8 @@ static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const l number = quarterPoints * 4; #endif /* LV_HAVE_LIB_SIMDMATH */ - lv_32fc_t complexPower; - ((float*)&complexPower)[0] = power; - ((float*)&complexPower)[1] = 0; for(;number < num_points; number++){ - *cPtr++ = lv_cpow((*aPtr++), complexPower); + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); } } #endif /* LV_HAVE_SSE */ @@ -93,12 +98,9 @@ static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, con lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; unsigned int number = 
0; - lv_32fc_t complexPower; - ((float*)&complexPower)[0] = power; - ((float*)&complexPower)[1] = 0.0; for(number = 0; number < num_points; number++){ - *cPtr++ = lv_cpow((*aPtr++), complexPower); + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h index 73576a766..6b22d9f81 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -131,7 +131,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result Isum += Im; } - result[0] = lv_32fc_init(Rsum,Isum); + result[0] = lv_cmake(Rsum,Isum); return; } diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h index 9e8982e9b..7307ae484 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h @@ -55,10 +55,10 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVe for(; number < num_points; number++){ float aReal = (float)*a8Ptr++; float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + lv_32fc_t aVal = lv_cmake(aReal, aImag ); float bReal = (float)*b8Ptr++; float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); lv_32fc_t temp = aVal * bVal; *c16Ptr++ = (int16_t)lv_creal(temp); @@ -83,10 +83,10 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cV for(number =0; number < num_points; number++){ float aReal = (float)*a8Ptr++; float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + lv_32fc_t aVal = lv_cmake(aReal, aImag ); float bReal = (float)*b8Ptr++; float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = 
lv_32fc_init( bReal, -bImag ); + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); lv_32fc_t temp = aVal * bVal; *c16Ptr++ = (int16_t)lv_creal(temp); diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h index fa58ff058..adc7c0599 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h @@ -75,10 +75,10 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t for(; number < num_points; number++){ float aReal = (float)*a8Ptr++; float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + lv_32fc_t aVal = lv_cmake(aReal, aImag ); float bReal = (float)*b8Ptr++; float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); lv_32fc_t temp = aVal * bVal; *cFloatPtr++ = lv_creal(temp) / scalar; @@ -104,10 +104,10 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_ for(number = 0; number < num_points; number++){ float aReal = (float)*a8Ptr++; float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + lv_32fc_t aVal = lv_cmake(aReal, aImag ); float bReal = (float)*b8Ptr++; float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); lv_32fc_t temp = aVal * bVal; *cPtr++ = (lv_creal(temp) * invScalar); diff --git a/volk/include/volk/volk_complex.h b/volk/include/volk/volk_complex.h index b20b5cf47..5bd925044 100644 --- a/volk/include/volk/volk_complex.h +++ b/volk/include/volk/volk_complex.h @@ -2,8 +2,21 @@ #define INCLUDE_VOLK_COMPLEX_H /*! - \brief This header file is to prevent issues with having and variables in the same code as the gcc compiler does not allow that -*/ + * \brief Provide typedefs and operators for all complex types in C and C++. 
+ * + * The typedefs encompass all signed integer and floating point types. + * Each operator function is intended to work across all data types. + * Under C++, these operators are defined as inline templates. + * Under C, these operators are defined as preprocessor macros. + * The use of macros makes the operators agnostic to the type. + * + * The following operator functions are defined: + * - lv_cmake - make a complex type from components + * - lv_creal - get the real part of the complex number + * - lv_cimag - get the imaginary part of the complex number + * - lv_conj - take the conjugate of the complex number + */ + #ifdef __cplusplus #include @@ -12,60 +25,62 @@ typedef std::complex lv_8sc_t; typedef std::complex lv_16sc_t; typedef std::complex lv_32sc_t; +typedef std::complex lv_64sc_t; typedef std::complex lv_32fc_t; typedef std::complex lv_64fc_t; -static inline float lv_creal(const lv_32fc_t x){ - return x.real(); -} - -static inline float lv_cimag(const lv_32fc_t x){ - return x.imag(); +template inline std::complex lv_cmake(const T &r, const T &i){ + return std::complex(r, i); } -static inline lv_32fc_t lv_conj(const lv_32fc_t x){ - return std::conj(x); +template inline typename T::value_type lv_creal(const T &x){ + return x.real(); } -static inline lv_32fc_t lv_cpow(const lv_32fc_t x, const lv_32fc_t y){ - return std::pow(x, y); +template inline typename T::value_type lv_cimag(const T &x){ + return x.imag(); } -static inline lv_32fc_t lv_32fc_init(const float x, const float y){ - return std::complex(x,y); +template inline T lv_conj(const T &x){ + return std::conj(x); } -#else +#else /* __cplusplus */ #include typedef char complex lv_8sc_t; typedef short complex lv_16sc_t; -typedef int complex lv_32sc_t; +typedef long complex lv_32sc_t; +typedef long long complex lv_64sc_t; typedef float complex lv_32fc_t; typedef double complex lv_64fc_t; -static inline float lv_creal(const lv_32fc_t x){ - return creal(x); -} +#define lv_cmake(r, i) ((r) + 
_Complex_I*(i)) -static inline float lv_cimag(const lv_32fc_t x){ - return cimag(x); -} +// When GNUC is available, use the complex extensions. +// The extensions always return the correct value type. +// http://gcc.gnu.org/onlinedocs/gcc/Complex.html +#ifdef __GNUC__ -static inline lv_32fc_t lv_conj(const lv_32fc_t x){ - return conj(x); -} +#define lv_creal(x) (__real__(x)) -static inline lv_32fc_t lv_cpow(const lv_32fc_t x, const lv_32fc_t y){ - return cpow(x, y); -} +#define lv_cimag(x) (__imag__(x)) -static inline lv_32fc_t lv_32fc_init(const float x, const float y){ - return x + I*y; -} +#define lv_conj(x) (~(x)) + +// When not available, use the c99 complex function family, +// which always returns double regardless of the input type. +#else /* __GNUC__ */ + +#define lv_creal(x) (creal(x)) + +#define lv_cimag(x) (cimag(x)) + +#define lv_conj(x) (conj(x)) -#endif +#endif /* __GNUC__ */ +#endif /* __cplusplus */ #endif /* INCLUDE_VOLK_COMPLEX_H */ -- cgit From 21d6870a6ef5284a5941dce1484bcfff6684ffea Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Mon, 9 May 2011 20:31:56 -0700 Subject: volk: cmake support for volk (gcc + msvc) --- volk/include/volk/volk_common.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index 12623073c..2c935d1fb 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -32,6 +32,14 @@ # define __VOLK_ATTR_IMPORT #endif +//////////////////////////////////////////////////////////////////////// +// Ignore annoying warnings in MSVC +//////////////////////////////////////////////////////////////////////// +#if defined(_MSC_VER) +# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data +# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2' +#endif + //////////////////////////////////////////////////////////////////////// // C-linkage 
declaration macros // FIXME: due to the usage of complex.h, require gcc for c-linkage -- cgit From a1b9b5c16c53bedfe8ebab39055a36dee387a9a4 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Thu, 12 May 2011 15:07:31 -0700 Subject: Volk: forgot to add prefs.c/h to git... --- volk/include/volk/volk_prefs.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 volk/include/volk/volk_prefs.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_prefs.h b/volk/include/volk/volk_prefs.h new file mode 100644 index 000000000..77bcb709b --- /dev/null +++ b/volk/include/volk/volk_prefs.h @@ -0,0 +1,19 @@ +#ifndef INCLUDED_VOLK_PREFS_H +#define INCLUDED_VOLK_PREFS_H + +struct volk_arch_pref { + char name[128]; + char arch[32]; +}; + +//////////////////////////////////////////////////////////////////////// +// get path to volk_config profiling info +//////////////////////////////////////////////////////////////////////// +void get_config_path(char *); + +//////////////////////////////////////////////////////////////////////// +// load prefs into global prefs struct +//////////////////////////////////////////////////////////////////////// +int load_preferences(struct volk_arch_pref **); + +#endif //INCLUDED_VOLK_PREFS_H -- cgit From 88d41e8bc991476880d40d2369018e9adf0011ff Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Thu, 12 May 2011 16:17:46 -0700 Subject: Volk: added an AVX impl (of 32f multiply) just to see if it's any faster. It's not. 
--- volk/include/volk/volk_32f_x2_multiply_32f_a16.h | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h index cef17f5a6..885941abf 100644 --- a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h +++ b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h @@ -43,6 +43,45 @@ static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX +#include +/*! + \brief Multiplies the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_a16_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_GENERIC /*! 
\brief Multiplys the two input vectors and store their results in the third vector -- cgit From 7d349848c60f99f9906cb57d0ebe3c7dd35096bd Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Thu, 12 May 2011 17:39:11 -0700 Subject: Volk: avx impl for 32f_s32f_convert_32i --- volk/include/volk/volk_32f_s32f_convert_32i_a16.h | 36 +++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h index 2927d616c..3f5044313 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h @@ -5,6 +5,42 @@ #include #include +#ifdef LV_HAVE_AVX +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1; + __m256i intInputVal1; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; + + intInputVal1 = _mm256_cvtps_epi32(_mm256_mul_ps(inputVal1, vScalar)); + + _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef 
LV_HAVE_SSE2 #include /*! -- cgit From 9bfe75fd7c6a7069db2d2a98195faabf6ba248e2 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Fri, 13 May 2011 13:58:01 -0700 Subject: volk: do not install library-only headers --- volk/include/volk/Makefile.am | 3 +-- volk/include/volk/volk_prefs.h | 12 +++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index a05f0fbfd..5f9e134bc 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -28,12 +28,11 @@ volkincludedir = $(prefix)/include/volk volkinclude_HEADERS = \ volk_complex.h \ volk_common.h \ + volk_prefs.h \ $(top_gendir)/include/volk/volk_config_fixed.h \ $(top_gendir)/include/volk/volk_typedefs.h \ $(top_gendir)/include/volk/volk.h \ $(top_gendir)/include/volk/volk_cpu.h \ - $(top_gendir)/include/volk/volk_machines.h \ - $(top_gendir)/include/volk/volk_environment_init.h \ volk_16i_x5_add_quad_16i_x4_a16.h \ volk_16i_branch_4_state_8_a16.h \ volk_16ic_deinterleave_16i_x2_a16.h \ diff --git a/volk/include/volk/volk_prefs.h b/volk/include/volk/volk_prefs.h index 77bcb709b..2a7f7e79f 100644 --- a/volk/include/volk/volk_prefs.h +++ b/volk/include/volk/volk_prefs.h @@ -1,7 +1,11 @@ #ifndef INCLUDED_VOLK_PREFS_H #define INCLUDED_VOLK_PREFS_H -struct volk_arch_pref { +#include + +__VOLK_DECL_BEGIN + +struct VOLK_API volk_arch_pref { char name[128]; char arch[32]; }; @@ -9,11 +13,13 @@ struct volk_arch_pref { //////////////////////////////////////////////////////////////////////// // get path to volk_config profiling info //////////////////////////////////////////////////////////////////////// -void get_config_path(char *); +VOLK_API void get_config_path(char *); //////////////////////////////////////////////////////////////////////// // load prefs into global prefs struct //////////////////////////////////////////////////////////////////////// -int load_preferences(struct volk_arch_pref **); 
+VOLK_API int load_preferences(struct volk_arch_pref **); + +__VOLK_DECL_END #endif //INCLUDED_VOLK_PREFS_H -- cgit From 9d2ee4d74d312fd788f02a85b1d24ac16e4eb4eb Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 18 May 2011 13:19:48 -0700 Subject: Volk: rename aligned functions to just _a instead of _a16 --- volk/include/volk/volk_16i_branch_4_state_8_a.h | 194 +++++++++ volk/include/volk/volk_16i_branch_4_state_8_a16.h | 194 --------- volk/include/volk/volk_16i_convert_8i_a.h | 69 +++ volk/include/volk/volk_16i_convert_8i_a16.h | 69 --- volk/include/volk/volk_16i_max_star_16i_a.h | 108 +++++ volk/include/volk/volk_16i_max_star_16i_a16.h | 108 ----- .../volk/volk_16i_max_star_horizontal_16i_a.h | 130 ++++++ .../volk/volk_16i_max_star_horizontal_16i_a16.h | 130 ------ .../volk/volk_16i_permute_and_scalar_add_a.h | 139 ++++++ .../volk/volk_16i_permute_and_scalar_add_a16.h | 139 ------ volk/include/volk/volk_16i_s32f_convert_32f_a.h | 119 ++++++ volk/include/volk/volk_16i_s32f_convert_32f_a16.h | 119 ------ .../include/volk/volk_16i_x4_quad_max_star_16i_a.h | 191 +++++++++ .../volk/volk_16i_x4_quad_max_star_16i_a16.h | 191 --------- volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h | 136 ++++++ .../include/volk/volk_16i_x5_add_quad_16i_x4_a16.h | 136 ------ .../include/volk/volk_16ic_deinterleave_16i_x2_a.h | 158 +++++++ .../volk/volk_16ic_deinterleave_16i_x2_a16.h | 158 ------- .../volk/volk_16ic_deinterleave_real_16i_a.h | 120 ++++++ .../volk/volk_16ic_deinterleave_real_16i_a16.h | 120 ------ .../volk/volk_16ic_deinterleave_real_8i_a.h | 94 +++++ .../volk/volk_16ic_deinterleave_real_8i_a16.h | 94 ----- volk/include/volk/volk_16ic_magnitude_16i_a.h | 191 +++++++++ volk/include/volk/volk_16ic_magnitude_16i_a16.h | 191 --------- .../volk/volk_16ic_s32f_deinterleave_32f_x2_a.h | 109 +++++ .../volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h | 109 ----- .../volk/volk_16ic_s32f_deinterleave_real_32f_a.h | 126 ++++++ .../volk_16ic_s32f_deinterleave_real_32f_a16.h | 126 
------ volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h | 180 ++++++++ .../volk/volk_16ic_s32f_magnitude_32f_a16.h | 180 -------- volk/include/volk/volk_16u_byteswap_a.h | 77 ++++ volk/include/volk/volk_16u_byteswap_a16.h | 77 ---- volk/include/volk/volk_32f_accumulator_s32f_a.h | 68 +++ volk/include/volk/volk_32f_accumulator_s32f_a16.h | 68 --- volk/include/volk/volk_32f_convert_64f_a.h | 70 +++ volk/include/volk/volk_32f_convert_64f_a16.h | 70 --- volk/include/volk/volk_32f_index_max_16u_a.h | 149 +++++++ volk/include/volk/volk_32f_index_max_16u_a16.h | 149 ------- .../volk/volk_32f_s32f_32f_fm_detect_32f_a.h | 120 ++++++ .../volk/volk_32f_s32f_32f_fm_detect_32f_a16.h | 120 ------ ...volk_32f_s32f_calc_spectral_noise_floor_32f_a.h | 168 ++++++++ ...lk_32f_s32f_calc_spectral_noise_floor_32f_a16.h | 168 -------- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 111 +++++ volk/include/volk/volk_32f_s32f_convert_16i_a16.h | 111 ----- volk/include/volk/volk_32f_s32f_convert_32i_a.h | 143 +++++++ volk/include/volk/volk_32f_s32f_convert_32i_a16.h | 143 ------- volk/include/volk/volk_32f_s32f_convert_8i_a.h | 118 ++++++ volk/include/volk/volk_32f_s32f_convert_8i_a16.h | 118 ------ volk/include/volk/volk_32f_s32f_normalize_a.h | 81 ++++ volk/include/volk/volk_32f_s32f_normalize_a16.h | 81 ---- volk/include/volk/volk_32f_s32f_power_32f_a.h | 144 +++++++ volk/include/volk/volk_32f_s32f_power_32f_a16.h | 144 ------- volk/include/volk/volk_32f_s32f_stddev_32f_a.h | 145 +++++++ volk/include/volk/volk_32f_s32f_stddev_32f_a16.h | 145 ------- volk/include/volk/volk_32f_sqrt_32f_a.h | 77 ++++ volk/include/volk/volk_32f_sqrt_32f_a16.h | 77 ---- .../volk/volk_32f_stddev_and_mean_32f_x2_a.h | 170 ++++++++ .../volk/volk_32f_stddev_and_mean_32f_x2_a16.h | 170 -------- volk/include/volk/volk_32f_x2_add_32f_a.h | 81 ++++ volk/include/volk/volk_32f_x2_add_32f_a16.h | 81 ---- volk/include/volk/volk_32f_x2_divide_32f_a.h | 82 ++++ volk/include/volk/volk_32f_x2_divide_32f_a16.h | 82 ---- 
volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 185 ++++++++ volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h | 185 -------- volk/include/volk/volk_32f_x2_interleave_32fc_a.h | 75 ++++ .../include/volk/volk_32f_x2_interleave_32fc_a16.h | 75 ---- volk/include/volk/volk_32f_x2_max_32f_a.h | 85 ++++ volk/include/volk/volk_32f_x2_max_32f_a16.h | 85 ---- volk/include/volk/volk_32f_x2_min_32f_a.h | 85 ++++ volk/include/volk/volk_32f_x2_min_32f_a16.h | 85 ---- volk/include/volk/volk_32f_x2_multiply_32f_a.h | 120 ++++++ volk/include/volk/volk_32f_x2_multiply_32f_a16.h | 120 ------ .../volk/volk_32f_x2_s32f_interleave_16ic_a.h | 156 +++++++ .../volk/volk_32f_x2_s32f_interleave_16ic_a16.h | 156 ------- volk/include/volk/volk_32f_x2_subtract_32f_a.h | 81 ++++ volk/include/volk/volk_32f_x2_subtract_32f_a16.h | 81 ---- volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h | 151 +++++++ .../include/volk/volk_32f_x3_sum_of_poly_32f_a16.h | 151 ------- volk/include/volk/volk_32fc_32f_multiply_32fc_a.h | 95 +++++ .../include/volk/volk_32fc_32f_multiply_32fc_a16.h | 95 ----- .../include/volk/volk_32fc_deinterleave_32f_x2_a.h | 75 ++++ .../volk/volk_32fc_deinterleave_32f_x2_a16.h | 75 ---- .../include/volk/volk_32fc_deinterleave_64f_x2_a.h | 78 ++++ .../volk/volk_32fc_deinterleave_64f_x2_a16.h | 78 ---- .../volk/volk_32fc_deinterleave_real_32f_a.h | 68 +++ .../volk/volk_32fc_deinterleave_real_32f_a16.h | 68 --- .../volk/volk_32fc_deinterleave_real_64f_a.h | 66 +++ .../volk/volk_32fc_deinterleave_real_64f_a16.h | 66 --- volk/include/volk/volk_32fc_index_max_16u_a.h | 215 ++++++++++ volk/include/volk/volk_32fc_index_max_16u_a16.h | 215 ---------- volk/include/volk/volk_32fc_magnitude_32f_a.h | 132 ++++++ volk/include/volk/volk_32fc_magnitude_32f_a16.h | 132 ------ volk/include/volk/volk_32fc_s32f_atan2_32f_a.h | 158 +++++++ volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h | 158 ------- .../volk/volk_32fc_s32f_deinterleave_real_16i_a.h | 81 ++++ 
.../volk_32fc_s32f_deinterleave_real_16i_a16.h | 81 ---- volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h | 159 +++++++ .../volk/volk_32fc_s32f_magnitude_16i_a16.h | 159 ------- volk/include/volk/volk_32fc_s32f_power_32fc_a.h | 111 +++++ volk/include/volk/volk_32fc_s32f_power_32fc_a16.h | 111 ----- .../volk/volk_32fc_s32f_power_spectrum_32f_a.h | 126 ++++++ .../volk/volk_32fc_s32f_power_spectrum_32f_a16.h | 126 ------ ...olk_32fc_s32f_x2_power_spectral_density_32f_a.h | 134 ++++++ ...k_32fc_s32f_x2_power_spectral_density_32f_a16.h | 134 ------ .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h | 345 +++++++++++++++ .../volk_32fc_x2_conjugate_dot_prod_32fc_a16.h | 345 --------------- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h | 469 +++++++++++++++++++++ volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 469 --------------------- volk/include/volk/volk_32fc_x2_multiply_32fc_a.h | 94 +++++ volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h | 94 ----- ...lk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h | 126 ++++++ ..._32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h | 126 ------ volk/include/volk/volk_32fc_x2_square_dist_32f_a.h | 112 +++++ .../volk/volk_32fc_x2_square_dist_32f_a16.h | 112 ----- volk/include/volk/volk_32i_s32f_convert_32f_a.h | 73 ++++ volk/include/volk/volk_32i_s32f_convert_32f_a16.h | 73 ---- volk/include/volk/volk_32i_x2_and_32i_a.h | 81 ++++ volk/include/volk/volk_32i_x2_and_32i_a16.h | 81 ---- volk/include/volk/volk_32i_x2_or_32i_a.h | 81 ++++ volk/include/volk/volk_32i_x2_or_32i_a16.h | 81 ---- volk/include/volk/volk_32u_byteswap_a.h | 77 ++++ volk/include/volk/volk_32u_byteswap_a16.h | 77 ---- volk/include/volk/volk_32u_popcnt_a.h | 36 ++ volk/include/volk/volk_32u_popcnt_a16.h | 36 -- volk/include/volk/volk_64f_convert_32f_a.h | 67 +++ volk/include/volk/volk_64f_convert_32f_a16.h | 67 --- volk/include/volk/volk_64f_x2_max_64f_a.h | 71 ++++ volk/include/volk/volk_64f_x2_max_64f_a16.h | 71 ---- volk/include/volk/volk_64f_x2_min_64f_a.h | 
71 ++++ volk/include/volk/volk_64f_x2_min_64f_a16.h | 71 ---- volk/include/volk/volk_64u_byteswap_a.h | 88 ++++ volk/include/volk/volk_64u_byteswap_a16.h | 88 ---- volk/include/volk/volk_64u_popcnt_a.h | 50 +++ volk/include/volk/volk_64u_popcnt_a16.h | 50 --- volk/include/volk/volk_8i_convert_16i_a.h | 83 ++++ volk/include/volk/volk_8i_convert_16i_a16.h | 83 ---- volk/include/volk/volk_8i_s32f_convert_32f_a.h | 106 +++++ volk/include/volk/volk_8i_s32f_convert_32f_a16.h | 106 ----- volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h | 77 ++++ .../volk/volk_8ic_deinterleave_16i_x2_a16.h | 77 ---- .../volk/volk_8ic_deinterleave_real_16i_a.h | 66 +++ .../volk/volk_8ic_deinterleave_real_16i_a16.h | 66 --- .../include/volk/volk_8ic_deinterleave_real_8i_a.h | 67 +++ .../volk/volk_8ic_deinterleave_real_8i_a16.h | 67 --- .../volk/volk_8ic_s32f_deinterleave_32f_x2_a.h | 165 ++++++++ .../volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h | 165 -------- .../volk/volk_8ic_s32f_deinterleave_real_32f_a.h | 134 ++++++ .../volk/volk_8ic_s32f_deinterleave_real_32f_a16.h | 134 ------ .../volk/volk_8ic_x2_multiply_conjugate_16ic_a.h | 101 +++++ .../volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h | 101 ----- .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h | 122 ++++++ .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h | 122 ------ 152 files changed, 9066 insertions(+), 9066 deletions(-) create mode 100644 volk/include/volk/volk_16i_branch_4_state_8_a.h delete mode 100644 volk/include/volk/volk_16i_branch_4_state_8_a16.h create mode 100644 volk/include/volk/volk_16i_convert_8i_a.h delete mode 100644 volk/include/volk/volk_16i_convert_8i_a16.h create mode 100644 volk/include/volk/volk_16i_max_star_16i_a.h delete mode 100644 volk/include/volk/volk_16i_max_star_16i_a16.h create mode 100644 volk/include/volk/volk_16i_max_star_horizontal_16i_a.h delete mode 100644 volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h create mode 100644 volk/include/volk/volk_16i_permute_and_scalar_add_a.h delete 
mode 100644 volk/include/volk/volk_16i_permute_and_scalar_add_a16.h create mode 100644 volk/include/volk/volk_16i_s32f_convert_32f_a.h delete mode 100644 volk/include/volk/volk_16i_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h delete mode 100644 volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h create mode 100644 volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h delete mode 100644 volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h create mode 100644 volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h delete mode 100644 volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h create mode 100644 volk/include/volk/volk_16ic_deinterleave_real_16i_a.h delete mode 100644 volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h create mode 100644 volk/include/volk/volk_16ic_deinterleave_real_8i_a.h delete mode 100644 volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h create mode 100644 volk/include/volk/volk_16ic_magnitude_16i_a.h delete mode 100644 volk/include/volk/volk_16ic_magnitude_16i_a16.h create mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h delete mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h create mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h delete mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h create mode 100644 volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h delete mode 100644 volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h create mode 100644 volk/include/volk/volk_16u_byteswap_a.h delete mode 100644 volk/include/volk/volk_16u_byteswap_a16.h create mode 100644 volk/include/volk/volk_32f_accumulator_s32f_a.h delete mode 100644 volk/include/volk/volk_32f_accumulator_s32f_a16.h create mode 100644 volk/include/volk/volk_32f_convert_64f_a.h delete mode 100644 volk/include/volk/volk_32f_convert_64f_a16.h create mode 100644 volk/include/volk/volk_32f_index_max_16u_a.h delete mode 100644 
volk/include/volk/volk_32f_index_max_16u_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_16i_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_16i_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_32i_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_32i_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_8i_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_8i_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_normalize_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_normalize_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_power_32f_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_power_32f_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_stddev_32f_a.h delete mode 100644 volk/include/volk/volk_32f_s32f_stddev_32f_a16.h create mode 100644 volk/include/volk/volk_32f_sqrt_32f_a.h delete mode 100644 volk/include/volk/volk_32f_sqrt_32f_a16.h create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h delete mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h create mode 100644 volk/include/volk/volk_32f_x2_add_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_add_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_divide_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_divide_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_dot_prod_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_interleave_32fc_a.h delete mode 100644 volk/include/volk/volk_32f_x2_interleave_32fc_a16.h create mode 100644 
volk/include/volk/volk_32f_x2_max_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_max_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_min_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_min_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_multiply_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_multiply_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h delete mode 100644 volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h create mode 100644 volk/include/volk/volk_32f_x2_subtract_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x2_subtract_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h delete mode 100644 volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_32f_multiply_32fc_a.h delete mode 100644 volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_a.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h create mode 100644 volk/include/volk/volk_32fc_index_max_16u_a.h delete mode 100644 volk/include/volk/volk_32fc_index_max_16u_a16.h create mode 100644 volk/include/volk/volk_32fc_magnitude_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_magnitude_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_atan2_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h delete mode 
100644 volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h delete mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_power_32fc_a.h delete mode 100644 volk/include/volk/volk_32fc_s32f_power_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h delete mode 100644 volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h delete mode 100644 volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_multiply_32fc_a.h delete mode 100644 volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_square_dist_32f_a.h delete mode 100644 volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h create mode 100644 volk/include/volk/volk_32i_s32f_convert_32f_a.h delete mode 100644 volk/include/volk/volk_32i_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_32i_x2_and_32i_a.h delete mode 100644 volk/include/volk/volk_32i_x2_and_32i_a16.h create mode 100644 volk/include/volk/volk_32i_x2_or_32i_a.h delete mode 100644 volk/include/volk/volk_32i_x2_or_32i_a16.h create mode 100644 volk/include/volk/volk_32u_byteswap_a.h delete mode 100644 volk/include/volk/volk_32u_byteswap_a16.h create mode 100644 volk/include/volk/volk_32u_popcnt_a.h delete mode 
100644 volk/include/volk/volk_32u_popcnt_a16.h create mode 100644 volk/include/volk/volk_64f_convert_32f_a.h delete mode 100644 volk/include/volk/volk_64f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_64f_x2_max_64f_a.h delete mode 100644 volk/include/volk/volk_64f_x2_max_64f_a16.h create mode 100644 volk/include/volk/volk_64f_x2_min_64f_a.h delete mode 100644 volk/include/volk/volk_64f_x2_min_64f_a16.h create mode 100644 volk/include/volk/volk_64u_byteswap_a.h delete mode 100644 volk/include/volk/volk_64u_byteswap_a16.h create mode 100644 volk/include/volk/volk_64u_popcnt_a.h delete mode 100644 volk/include/volk/volk_64u_popcnt_a16.h create mode 100644 volk/include/volk/volk_8i_convert_16i_a.h delete mode 100644 volk/include/volk/volk_8i_convert_16i_a16.h create mode 100644 volk/include/volk/volk_8i_s32f_convert_32f_a.h delete mode 100644 volk/include/volk/volk_8i_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h delete mode 100644 volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h create mode 100644 volk/include/volk/volk_8ic_deinterleave_real_16i_a.h delete mode 100644 volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h create mode 100644 volk/include/volk/volk_8ic_deinterleave_real_8i_a.h delete mode 100644 volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h create mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h delete mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h create mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h delete mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h create mode 100644 volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h delete mode 100644 volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h create mode 100644 volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h delete mode 100644 volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h (limited to 
'volk/include') diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a.h b/volk/include/volk/volk_16i_branch_4_state_8_a.h new file mode 100644 index 000000000..5eb03b346 --- /dev/null +++ b/volk/include/volk/volk_16i_branch_4_state_8_a.h @@ -0,0 +1,194 @@ +#ifndef INCLUDED_volk_16i_branch_4_state_8_a16_H +#define INCLUDED_volk_16i_branch_4_state_8_a16_H + + +#include +#include + + + + +#ifdef LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16i_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + + __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; + + + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = 1; + + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + xmm0 = _mm_load_si128((__m128i*)permuters[0]); + xmm6 = _mm_load_si128((__m128i*)permuters[1]); + xmm8 = _mm_load_si128((__m128i*)permuters[2]); + xmm10 = _mm_load_si128((__m128i*)permuters[3]); + + for(; i < bound; ++i) { + + xmm5 = _mm_load_si128(p_src0); + + + + + + + + + + xmm0 = _mm_shuffle_epi8(xmm5, xmm0); + xmm6 = _mm_shuffle_epi8(xmm5, xmm6); + xmm8 = _mm_shuffle_epi8(xmm5, xmm8); + xmm10 = _mm_shuffle_epi8(xmm5, xmm10); + + p_src0 += 4; + + + xmm5 = _mm_add_epi16(xmm1, xmm2); + + xmm6 = _mm_add_epi16(xmm2, xmm6); + xmm8 = _mm_add_epi16(xmm1, xmm8); + + + xmm7 = _mm_load_si128(p_cntl2); + xmm9 = _mm_load_si128(p_cntl3); + + xmm0 = _mm_add_epi16(xmm5, xmm0); + + + xmm7 = 
_mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm5 = _mm_load_si128(&p_cntl2[1]); + xmm11 = _mm_load_si128(&p_cntl3[1]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm7); + + + + xmm7 = _mm_load_si128(&p_cntl2[2]); + xmm9 = _mm_load_si128(&p_cntl3[2]); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm6 = _mm_add_epi16(xmm6, xmm5); + + + xmm5 = _mm_load_si128(&p_cntl2[3]); + xmm11 = _mm_load_si128(&p_cntl3[3]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm8 = _mm_add_epi16(xmm8, xmm7); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + _mm_store_si128(p_target, xmm0); + _mm_store_si128(&p_target[1], xmm6); + + xmm10 = _mm_add_epi16(xmm5, xmm10); + + _mm_store_si128(&p_target[2], xmm8); + + _mm_store_si128(&p_target[3], xmm10); + + p_target += 3; + } +} + + +#endif /*LV_HAVE_SSEs*/ + +#ifdef LV_HAVE_GENERIC +static inline void volk_16i_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + int i = 0; + + int bound = 4; + + for(; i < bound; ++i) { + target[i* 8] = src0[((char)permuters[i][0])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8] & scalars[2]) + + (cntl3[i * 8] & scalars[3]); + target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 1] & scalars[2]) + + (cntl3[i * 8 + 1] & scalars[3]); + target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 2] & scalars[2]) + + (cntl3[i * 8 + 2] & scalars[3]); + target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 3] & scalars[2]) 
+ + (cntl3[i * 8 + 3] & scalars[3]); + target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 4] & scalars[2]) + + (cntl3[i * 8 + 4] & scalars[3]); + target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 5] & scalars[2]) + + (cntl3[i * 8 + 5] & scalars[3]); + target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 6] & scalars[2]) + + (cntl3[i * 8 + 6] & scalars[3]); + target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 7] & scalars[2]) + + (cntl3[i * 8 + 7] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16i_branch_4_state_8_a16_H*/ diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a16.h b/volk/include/volk/volk_16i_branch_4_state_8_a16.h deleted file mode 100644 index 5eb03b346..000000000 --- a/volk/include/volk/volk_16i_branch_4_state_8_a16.h +++ /dev/null @@ -1,194 +0,0 @@ -#ifndef INCLUDED_volk_16i_branch_4_state_8_a16_H -#define INCLUDED_volk_16i_branch_4_state_8_a16_H - - -#include -#include - - - - -#ifdef LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16i_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; - - __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; - - - - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; - - int i = 0; - - int bound = 1; - - - xmm0 = _mm_load_si128(p_scalars); - - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - 
xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - - xmm0 = _mm_load_si128((__m128i*)permuters[0]); - xmm6 = _mm_load_si128((__m128i*)permuters[1]); - xmm8 = _mm_load_si128((__m128i*)permuters[2]); - xmm10 = _mm_load_si128((__m128i*)permuters[3]); - - for(; i < bound; ++i) { - - xmm5 = _mm_load_si128(p_src0); - - - - - - - - - - xmm0 = _mm_shuffle_epi8(xmm5, xmm0); - xmm6 = _mm_shuffle_epi8(xmm5, xmm6); - xmm8 = _mm_shuffle_epi8(xmm5, xmm8); - xmm10 = _mm_shuffle_epi8(xmm5, xmm10); - - p_src0 += 4; - - - xmm5 = _mm_add_epi16(xmm1, xmm2); - - xmm6 = _mm_add_epi16(xmm2, xmm6); - xmm8 = _mm_add_epi16(xmm1, xmm8); - - - xmm7 = _mm_load_si128(p_cntl2); - xmm9 = _mm_load_si128(p_cntl3); - - xmm0 = _mm_add_epi16(xmm5, xmm0); - - - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); - - xmm5 = _mm_load_si128(&p_cntl2[1]); - xmm11 = _mm_load_si128(&p_cntl3[1]); - - xmm7 = _mm_add_epi16(xmm7, xmm9); - - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); - - xmm0 = _mm_add_epi16(xmm0, xmm7); - - - - xmm7 = _mm_load_si128(&p_cntl2[2]); - xmm9 = _mm_load_si128(&p_cntl3[2]); - - xmm5 = _mm_add_epi16(xmm5, xmm11); - - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); - - xmm6 = _mm_add_epi16(xmm6, xmm5); - - - xmm5 = _mm_load_si128(&p_cntl2[3]); - xmm11 = _mm_load_si128(&p_cntl3[3]); - - xmm7 = _mm_add_epi16(xmm7, xmm9); - - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); - - xmm8 = _mm_add_epi16(xmm8, xmm7); - - xmm5 = _mm_add_epi16(xmm5, xmm11); - - _mm_store_si128(p_target, xmm0); - _mm_store_si128(&p_target[1], xmm6); - - xmm10 = _mm_add_epi16(xmm5, xmm10); - - _mm_store_si128(&p_target[2], xmm8); - - _mm_store_si128(&p_target[3], xmm10); - - p_target += 3; - } -} - - -#endif /*LV_HAVE_SSEs*/ - -#ifdef LV_HAVE_GENERIC -static inline void 
volk_16i_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - int i = 0; - - int bound = 4; - - for(; i < bound; ++i) { - target[i* 8] = src0[((char)permuters[i][0])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8] & scalars[2]) - + (cntl3[i * 8] & scalars[3]); - target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 1] & scalars[2]) - + (cntl3[i * 8 + 1] & scalars[3]); - target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 2] & scalars[2]) - + (cntl3[i * 8 + 2] & scalars[3]); - target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 3] & scalars[2]) - + (cntl3[i * 8 + 3] & scalars[3]); - target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 4] & scalars[2]) - + (cntl3[i * 8 + 4] & scalars[3]); - target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 5] & scalars[2]) - + (cntl3[i * 8 + 5] & scalars[3]); - target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 6] & scalars[2]) - + (cntl3[i * 8 + 6] & scalars[3]); - target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 7] & scalars[2]) - + (cntl3[i * 8 + 7] & scalars[3]); - - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_16i_branch_4_state_8_a16_H*/ diff --git a/volk/include/volk/volk_16i_convert_8i_a.h b/volk/include/volk/volk_16i_convert_8i_a.h new file mode 100644 index 000000000..4d51e5903 --- /dev/null +++ 
b/volk/include/volk/volk_16i_convert_8i_a.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_volk_16i_convert_8i_a16_H +#define INCLUDED_volk_16i_convert_8i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_convert_8i_a16_H */ diff --git a/volk/include/volk/volk_16i_convert_8i_a16.h b/volk/include/volk/volk_16i_convert_8i_a16.h deleted file mode 100644 index 4d51e5903..000000000 --- a/volk/include/volk/volk_16i_convert_8i_a16.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef INCLUDED_volk_16i_convert_8i_a16_H -#define INCLUDED_volk_16i_convert_8i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include -/*! 
- \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16i_convert_8i_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - - // Load the 16 values - inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); - - ret = _mm_packs_epi16(inputVal1, inputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, ret); - - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16i_convert_8i_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16i_convert_8i_a16_H */ diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/include/volk/volk_16i_max_star_16i_a.h new file mode 100644 index 000000000..063444279 --- /dev/null +++ b/volk/include/volk/volk_16i_max_star_16i_a.h @@ -0,0 +1,108 @@ +#ifndef INCLUDED_volk_16i_max_star_16i_a16_H +#define INCLUDED_volk_16i_max_star_16i_a16_H + + +#include +#include + + +#ifdef LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, unsigned int num_bytes) { + + + + short candidate = src0[0]; + short cands[8]; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + + + __m128i *p_src0; + + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + xmm1 = _mm_setzero_si128(); + xmm0 = _mm_setzero_si128(); + //_mm_insert_epi16(xmm0, candidate, 0); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm1); + + + for(i = 0; i < bound; ++i) { + xmm1 = _mm_load_si128(p_src0); + p_src0 += 1; + xmm2 = _mm_sub_epi16(xmm1, xmm0); + + + + + + + xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); + xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); + xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); + + xmm6 = _mm_xor_si128(xmm4, xmm5); + + xmm3 = _mm_and_si128(xmm3, xmm0); + xmm4 = _mm_and_si128(xmm6, xmm1); + + xmm0 = _mm_add_epi16(xmm3, xmm4); + + + } + 
+ _mm_store_si128((__m128i*)cands, xmm0); + + for(i = 0; i < 8; ++i) { + candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; + } + + + + for(i = 0; i < leftovers; ++i) { + + candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; + } + + target[0] = candidate; + + + + + +} + +#endif /*LV_HAVE_SSSE3*/ + +#ifdef LV_HAVE_GENERIC + +static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short candidate = src0[0]; + for(i = 1; i < bound; ++i) { + candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; + } + target[0] = candidate; + +} + + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16i_max_star_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_max_star_16i_a16.h b/volk/include/volk/volk_16i_max_star_16i_a16.h deleted file mode 100644 index 063444279..000000000 --- a/volk/include/volk/volk_16i_max_star_16i_a16.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef INCLUDED_volk_16i_max_star_16i_a16_H -#define INCLUDED_volk_16i_max_star_16i_a16_H - - -#include -#include - - -#ifdef LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, unsigned int num_bytes) { - - - - short candidate = src0[0]; - short cands[8]; - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; - - - __m128i *p_src0; - - p_src0 = (__m128i*)src0; - - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 1) & 7; - - int i = 0; - - - xmm1 = _mm_setzero_si128(); - xmm0 = _mm_setzero_si128(); - //_mm_insert_epi16(xmm0, candidate, 0); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm1); - - - for(i = 0; i < bound; ++i) { - xmm1 = _mm_load_si128(p_src0); - p_src0 += 1; - xmm2 = _mm_sub_epi16(xmm1, xmm0); - - - - - - - xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); - xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); - xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); - - xmm6 = _mm_xor_si128(xmm4, 
xmm5); - - xmm3 = _mm_and_si128(xmm3, xmm0); - xmm4 = _mm_and_si128(xmm6, xmm1); - - xmm0 = _mm_add_epi16(xmm3, xmm4); - - - } - - _mm_store_si128((__m128i*)cands, xmm0); - - for(i = 0; i < 8; ++i) { - candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; - } - - - - for(i = 0; i < leftovers; ++i) { - - candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; - } - - target[0] = candidate; - - - - - -} - -#endif /*LV_HAVE_SSSE3*/ - -#ifdef LV_HAVE_GENERIC - -static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - short candidate = src0[0]; - for(i = 1; i < bound; ++i) { - candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; - } - target[0] = candidate; - -} - - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_16i_max_star_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h new file mode 100644 index 000000000..ece6adb40 --- /dev/null +++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h @@ -0,0 +1,130 @@ +#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a16_H +#define INCLUDED_volk_16i_max_star_horizontal_16i_a16_H + + +#include +#include + + +#ifdef LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; + const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; + + + + volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i xmm5, xmm6, xmm7, xmm8; + + xmm4 = _mm_load_si128((__m128i*)shufmask0); + xmm5 = _mm_load_si128((__m128i*)shufmask1); + xmm6 = _mm_load_si128((__m128i*)andmask0); + xmm7 = _mm_load_si128((__m128i*)andmask1); + + __m128i *p_target, *p_src0; + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 5; + int intermediate = (num_bytes >> 4) & 1; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + for(i = 0; i < bound; ++i) { + + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(&p_src0[1]); + + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 2; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + xmm3 = _mm_and_si128(xmm2, xmm7); + + + xmm8 = _mm_add_epi8(xmm8, xmm4); + xmm3 = _mm_add_epi8(xmm3, xmm5); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm8); + xmm1 = _mm_shuffle_epi8(xmm1, xmm3); + + + xmm3 = _mm_add_epi16(xmm0, xmm1); + + + _mm_store_si128(p_target, xmm3); + + p_target += 1; + + } + + for(i = 0; i < intermediate; ++i) { + + xmm0 = _mm_load_si128(p_src0); + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 1; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + + xmm3 = _mm_add_epi8(xmm8, xmm4); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm3); + + + _mm_storel_pd((double*)p_target, (__m128d)xmm0); + + p_target = (__m128i*)((int8_t*)p_target + 8); + + } + + for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { + target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? 
src0[i] : src0[i + 1]; + } + + +} + +#endif /*LV_HAVE_SSSE3*/ + + +#ifdef LV_HAVE_GENERIC +static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + + for(i = 0; i < bound; i += 2) { + target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; + } + +} + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h deleted file mode 100644 index ece6adb40..000000000 --- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a16_H -#define INCLUDED_volk_16i_max_star_horizontal_16i_a16_H - - -#include -#include - - -#ifdef LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { - - const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; - const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; - const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; - - - - volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i xmm5, xmm6, xmm7, xmm8; - - xmm4 = _mm_load_si128((__m128i*)shufmask0); - xmm5 = _mm_load_si128((__m128i*)shufmask1); - xmm6 = _mm_load_si128((__m128i*)andmask0); - xmm7 = _mm_load_si128((__m128i*)andmask1); - - __m128i *p_target, *p_src0; - - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - - int 
bound = num_bytes >> 5; - int intermediate = (num_bytes >> 4) & 1; - int leftovers = (num_bytes >> 1) & 7; - - int i = 0; - - - for(i = 0; i < bound; ++i) { - - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(&p_src0[1]); - - - - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 2; - - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - - xmm8 = _mm_and_si128(xmm2, xmm6); - xmm3 = _mm_and_si128(xmm2, xmm7); - - - xmm8 = _mm_add_epi8(xmm8, xmm4); - xmm3 = _mm_add_epi8(xmm3, xmm5); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm8); - xmm1 = _mm_shuffle_epi8(xmm1, xmm3); - - - xmm3 = _mm_add_epi16(xmm0, xmm1); - - - _mm_store_si128(p_target, xmm3); - - p_target += 1; - - } - - for(i = 0; i < intermediate; ++i) { - - xmm0 = _mm_load_si128(p_src0); - - - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 1; - - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - - xmm8 = _mm_and_si128(xmm2, xmm6); - - xmm3 = _mm_add_epi8(xmm8, xmm4); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm3); - - - _mm_storel_pd((double*)p_target, (__m128d)xmm0); - - p_target = (__m128i*)((int8_t*)p_target + 8); - - } - - for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { - target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; - } - - -} - -#endif /*LV_HAVE_SSSE3*/ - - -#ifdef LV_HAVE_GENERIC -static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - - for(i = 0; i < bound; i += 2) { - target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? 
src0[i] : src0[i+1]; - } - -} - - - -#endif /*LV_HAVE_GENERIC*/ - -#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h new file mode 100644 index 000000000..ae1a18157 --- /dev/null +++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h @@ -0,0 +1,139 @@ +#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a16_H +#define INCLUDED_volk_16i_permute_and_scalar_add_a16_H + + +#include +#include + + + + +#ifdef LV_HAVE_SSE2 + +#include +#include + +static inline void volk_16i_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; + + short* p_permute_indexes = permute_indexes; + + p_target = (__m128i*)target; + p_cntl0 = (__m128i*)cntl0; + p_cntl1 = (__m128i*)cntl1; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + + for(; i < bound; ++i) { + xmm0 = _mm_setzero_si128(); + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = _mm_setzero_si128(); + + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); + xmm7 = _mm_insert_epi16(xmm7, 
src0[p_permute_indexes[3]], 3); + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_permute_indexes += 8; + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + xmm5 = _mm_load_si128(p_cntl0); + xmm6 = _mm_load_si128(p_cntl1); + xmm7 = _mm_load_si128(p_cntl2); + + xmm5 = _mm_and_si128(xmm5, xmm1); + xmm6 = _mm_and_si128(xmm6, xmm2); + xmm7 = _mm_and_si128(xmm7, xmm3); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + xmm5 = _mm_load_si128(p_cntl3); + + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_cntl0 += 1; + + xmm5 = _mm_and_si128(xmm5, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + p_cntl1 += 1; + p_cntl2 += 1; + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + p_cntl3 += 1; + + _mm_store_si128(p_target, xmm0); + + p_target += 1; + } + + + + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + } +} +#endif /*LV_HAVE_SSEs*/ + + +#ifdef LV_HAVE_GENERIC +static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a16_H*/ diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h deleted file mode 100644 index 
ae1a18157..000000000 --- a/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h +++ /dev/null @@ -1,139 +0,0 @@ -#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a16_H -#define INCLUDED_volk_16i_permute_and_scalar_add_a16_H - - -#include -#include - - - - -#ifdef LV_HAVE_SSE2 - -#include -#include - -static inline void volk_16i_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; - - short* p_permute_indexes = permute_indexes; - - p_target = (__m128i*)target; - p_cntl0 = (__m128i*)cntl0; - p_cntl1 = (__m128i*)cntl1; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - xmm0 = _mm_load_si128(p_scalars); - - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - - - for(; i < bound; ++i) { - xmm0 = _mm_setzero_si128(); - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = _mm_setzero_si128(); - - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 
7); - - xmm0 = _mm_add_epi16(xmm0, xmm5); - xmm6 = _mm_add_epi16(xmm6, xmm7); - - p_permute_indexes += 8; - - xmm0 = _mm_add_epi16(xmm0, xmm6); - - xmm5 = _mm_load_si128(p_cntl0); - xmm6 = _mm_load_si128(p_cntl1); - xmm7 = _mm_load_si128(p_cntl2); - - xmm5 = _mm_and_si128(xmm5, xmm1); - xmm6 = _mm_and_si128(xmm6, xmm2); - xmm7 = _mm_and_si128(xmm7, xmm3); - - xmm0 = _mm_add_epi16(xmm0, xmm5); - - xmm5 = _mm_load_si128(p_cntl3); - - xmm6 = _mm_add_epi16(xmm6, xmm7); - - p_cntl0 += 1; - - xmm5 = _mm_and_si128(xmm5, xmm4); - - xmm0 = _mm_add_epi16(xmm0, xmm6); - - p_cntl1 += 1; - p_cntl2 += 1; - - xmm0 = _mm_add_epi16(xmm0, xmm5); - - p_cntl3 += 1; - - _mm_store_si128(p_target, xmm0); - - p_target += 1; - } - - - - - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - } -} -#endif /*LV_HAVE_SSEs*/ - - -#ifdef LV_HAVE_GENERIC -static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - for(i = 0; i < bound; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a16_H*/ diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a.h b/volk/include/volk/volk_16i_s32f_convert_32f_a.h new file mode 100644 index 000000000..09bc252f0 --- /dev/null +++ b/volk/include/volk/volk_16i_s32f_convert_32f_a.h @@ -0,0 +1,119 @@ +#ifndef INCLUDED_volk_16i_s32f_convert_32f_a16_H +#define INCLUDED_volk_16i_s32f_convert_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + + /*! 
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include + + /*! 
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a16.h b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h deleted file mode 100644 index 09bc252f0..000000000 --- a/volk/include/volk/volk_16i_s32f_convert_32f_a16.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef INCLUDED_volk_16i_s32f_convert_32f_a16_H -#define INCLUDED_volk_16i_s32f_convert_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include - - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16i_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16i_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h new file mode 100644 index 000000000..94e5eb986 --- /dev/null +++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h @@ -0,0 +1,191 @@ +#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H +#define INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H + + +#include +#include + + + + + +#ifdef LV_HAVE_SSE2 + +#include + +static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + + + + int i = 0; + + int bound = (num_bytes >> 4); + int bound_copy = bound; + int leftovers = (num_bytes >> 1) & 7; + + __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; + p_target = (__m128i*) target; + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + + + + __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + while(bound_copy > 0) { + + xmm1 = _mm_load_si128(p_src0); + xmm2 = _mm_load_si128(p_src1); + xmm3 = _mm_load_si128(p_src2); + xmm4 = 
_mm_load_si128(p_src3); + + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = xmm1; + xmm8 = xmm3; + + + xmm1 = _mm_sub_epi16(xmm2, xmm1); + + + + xmm3 = _mm_sub_epi16(xmm4, xmm3); + + xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); + xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); + + + + xmm2 = _mm_and_si128(xmm5, xmm2); + xmm4 = _mm_and_si128(xmm6, xmm4); + xmm5 = _mm_andnot_si128(xmm5, xmm7); + xmm6 = _mm_andnot_si128(xmm6, xmm8); + + xmm5 = _mm_add_epi16(xmm2, xmm5); + xmm6 = _mm_add_epi16(xmm4, xmm6); + + + xmm1 = _mm_xor_si128(xmm1, xmm1); + xmm2 = xmm5; + xmm5 = _mm_sub_epi16(xmm6, xmm5); + p_src0 += 1; + bound_copy -= 1; + + xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); + p_src1 += 1; + + xmm6 = _mm_and_si128(xmm1, xmm6); + + xmm1 = _mm_andnot_si128(xmm1, xmm2); + p_src2 += 1; + + + + xmm1 = _mm_add_epi16(xmm6, xmm1); + p_src3 += 1; + + + _mm_store_si128(p_target, xmm1); + p_target += 1; + + } + + + /*asm volatile + ( + "volk_16i_x4_quad_max_star_16i_a16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je volk_16i_x4_quad_max_star_16i_a16_sse2_END\n\t" + + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "movaps %%xmm1, %%xmm7\n\t" + "movaps %%xmm3, %%xmm8\n\t" + "psubw %%xmm2, %%xmm1\n\t" + "psubw %%xmm4, %%xmm3\n\t" + + "pcmpgtw %%xmm1, %%xmm5\n\t" + "pcmpgtw %%xmm3, %%xmm6\n\t" + + "pand %%xmm5, %%xmm2\n\t" + "pand %%xmm6, %%xmm4\n\t" + "pandn %%xmm7, %%xmm5\n\t" + "pandn %%xmm8, %%xmm6\n\t" + + "paddw %%xmm2, %%xmm5\n\t" + "paddw %%xmm4, %%xmm6\n\t" + + "pxor %%xmm1, %%xmm1\n\t" + "movaps %%xmm5, %%xmm2\n\t" + + "psubw %%xmm6, %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $-1, %[bound]\n\t" + + "pcmpgtw %%xmm5, %%xmm1\n\t" + "add $16, %[src1]\n\t" + + "pand %%xmm1, %%xmm6\n\t" + + "pandn %%xmm2, %%xmm1\n\t" + "add $16, %[src2]\n\t" + + "paddw %%xmm6, %%xmm1\n\t" + "add $16, %[src3]\n\t" + + "movaps %%xmm1, (%[target])\n\t" + "addw $16, 
%[target]\n\t" + "jmp volk_16i_x4_quad_max_star_16i_a16_sse2_L1\n\t" + + "volk_16i_x4_quad_max_star_16i_a16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) + : + ); + */ + + short temp0 = 0; + short temp1 = 0; + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } + return; + + +} + +#endif /*LV_HAVE_SSE2*/ + + +#ifdef LV_HAVE_GENERIC +static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short temp0 = 0; + short temp1 = 0; + for(i = 0; i < bound; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? 
temp0 : temp1; + } +} + + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h deleted file mode 100644 index 94e5eb986..000000000 --- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H -#define INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H - - -#include -#include - - - - - -#ifdef LV_HAVE_SSE2 - -#include - -static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - - - - int i = 0; - - int bound = (num_bytes >> 4); - int bound_copy = bound; - int leftovers = (num_bytes >> 1) & 7; - - __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; - p_target = (__m128i*) target; - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - - - - __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - - while(bound_copy > 0) { - - xmm1 = _mm_load_si128(p_src0); - xmm2 = _mm_load_si128(p_src1); - xmm3 = _mm_load_si128(p_src2); - xmm4 = _mm_load_si128(p_src3); - - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = xmm1; - xmm8 = xmm3; - - - xmm1 = _mm_sub_epi16(xmm2, xmm1); - - - - xmm3 = _mm_sub_epi16(xmm4, xmm3); - - xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); - xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); - - - - xmm2 = _mm_and_si128(xmm5, xmm2); - xmm4 = _mm_and_si128(xmm6, xmm4); - xmm5 = _mm_andnot_si128(xmm5, xmm7); - xmm6 = _mm_andnot_si128(xmm6, xmm8); - - xmm5 = _mm_add_epi16(xmm2, xmm5); - xmm6 = _mm_add_epi16(xmm4, xmm6); - - - xmm1 = _mm_xor_si128(xmm1, xmm1); - xmm2 = xmm5; - xmm5 = _mm_sub_epi16(xmm6, xmm5); - p_src0 += 1; - bound_copy -= 1; - - xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); - p_src1 += 1; - - xmm6 = _mm_and_si128(xmm1, xmm6); - - xmm1 = _mm_andnot_si128(xmm1, xmm2); 
- p_src2 += 1; - - - - xmm1 = _mm_add_epi16(xmm6, xmm1); - p_src3 += 1; - - - _mm_store_si128(p_target, xmm1); - p_target += 1; - - } - - - /*asm volatile - ( - "volk_16i_x4_quad_max_star_16i_a16_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je volk_16i_x4_quad_max_star_16i_a16_sse2_END\n\t" - - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "movaps %%xmm1, %%xmm7\n\t" - "movaps %%xmm3, %%xmm8\n\t" - "psubw %%xmm2, %%xmm1\n\t" - "psubw %%xmm4, %%xmm3\n\t" - - "pcmpgtw %%xmm1, %%xmm5\n\t" - "pcmpgtw %%xmm3, %%xmm6\n\t" - - "pand %%xmm5, %%xmm2\n\t" - "pand %%xmm6, %%xmm4\n\t" - "pandn %%xmm7, %%xmm5\n\t" - "pandn %%xmm8, %%xmm6\n\t" - - "paddw %%xmm2, %%xmm5\n\t" - "paddw %%xmm4, %%xmm6\n\t" - - "pxor %%xmm1, %%xmm1\n\t" - "movaps %%xmm5, %%xmm2\n\t" - - "psubw %%xmm6, %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $-1, %[bound]\n\t" - - "pcmpgtw %%xmm5, %%xmm1\n\t" - "add $16, %[src1]\n\t" - - "pand %%xmm1, %%xmm6\n\t" - - "pandn %%xmm2, %%xmm1\n\t" - "add $16, %[src2]\n\t" - - "paddw %%xmm6, %%xmm1\n\t" - "add $16, %[src3]\n\t" - - "movaps %%xmm1, (%[target])\n\t" - "addw $16, %[target]\n\t" - "jmp volk_16i_x4_quad_max_star_16i_a16_sse2_L1\n\t" - - "volk_16i_x4_quad_max_star_16i_a16_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) - : - ); - */ - - short temp0 = 0; - short temp1 = 0; - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? 
temp0 : temp1; - } - return; - - -} - -#endif /*LV_HAVE_SSE2*/ - - -#ifdef LV_HAVE_GENERIC -static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - short temp0 = 0; - short temp1 = 0; - for(i = 0; i < bound; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } -} - - - - -#endif /*LV_HAVE_GENERIC*/ - -#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h new file mode 100644 index 000000000..c157bf64a --- /dev/null +++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h @@ -0,0 +1,136 @@ +#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H +#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H + + +#include +#include + + + + + +#ifdef LV_HAVE_SSE2 +#include +#include + +static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; + p_target0 = (__m128i*)target0; + p_target1 = (__m128i*)target1; + p_target2 = (__m128i*)target2; + p_target3 = (__m128i*)target3; + + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + p_src4 = (__m128i*)src4; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + for(; i < bound; ++i) { + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(p_src1); + xmm2 = _mm_load_si128(p_src2); + xmm3 = _mm_load_si128(p_src3); + xmm4 = _mm_load_si128(p_src4); + + 
p_src0 += 1; + p_src1 += 1; + + xmm1 = _mm_add_epi16(xmm0, xmm1); + xmm2 = _mm_add_epi16(xmm0, xmm2); + xmm3 = _mm_add_epi16(xmm0, xmm3); + xmm4 = _mm_add_epi16(xmm0, xmm4); + + + p_src2 += 1; + p_src3 += 1; + p_src4 += 1; + + _mm_store_si128(p_target0, xmm1); + _mm_store_si128(p_target1, xmm2); + _mm_store_si128(p_target2, xmm3); + _mm_store_si128(p_target3, xmm4); + + p_target0 += 1; + p_target1 += 1; + p_target2 += 1; + p_target3 += 1; + } + /*asm volatile + ( + ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END\n\t" + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + "movaps (%[src4]), %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $16, %[src1]\n\t" + "add $16, %[src2]\n\t" + "add $16, %[src3]\n\t" + "add $16, %[src4]\n\t" + "paddw %%xmm1, %%xmm2\n\t" + "paddw %%xmm1, %%xmm3\n\t" + "paddw %%xmm1, %%xmm4\n\t" + "paddw %%xmm1, %%xmm5\n\t" + "add $-1, %[bound]\n\t" + "movaps %%xmm2, (%[target0])\n\t" + "movaps %%xmm3, (%[target1])\n\t" + "movaps %%xmm4, (%[target2])\n\t" + "movaps %%xmm5, (%[target3])\n\t" + "add $16, %[target0]\n\t" + "add $16, %[target1]\n\t" + "add $16, %[target2]\n\t" + "add $16, %[target3]\n\t" + "jmp .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1\n\t" + ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) + :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + + */ + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} +#endif /*LV_HAVE_SSE2*/ + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short* 
target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + + + + + +#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H*/ diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h deleted file mode 100644 index c157bf64a..000000000 --- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H -#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H - - -#include -#include - - - - - -#ifdef LV_HAVE_SSE2 -#include -#include - -static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; - p_target0 = (__m128i*)target0; - p_target1 = (__m128i*)target1; - p_target2 = (__m128i*)target2; - p_target3 = (__m128i*)target3; - - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - p_src4 = (__m128i*)src4; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - for(; i < bound; ++i) { - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(p_src1); - xmm2 = _mm_load_si128(p_src2); - xmm3 = _mm_load_si128(p_src3); - xmm4 = _mm_load_si128(p_src4); - - p_src0 += 1; - p_src1 += 1; - - xmm1 = _mm_add_epi16(xmm0, xmm1); - xmm2 = _mm_add_epi16(xmm0, xmm2); - xmm3 = _mm_add_epi16(xmm0, xmm3); - xmm4 = _mm_add_epi16(xmm0, xmm4); - - - p_src2 += 1; 
- p_src3 += 1; - p_src4 += 1; - - _mm_store_si128(p_target0, xmm1); - _mm_store_si128(p_target1, xmm2); - _mm_store_si128(p_target2, xmm3); - _mm_store_si128(p_target3, xmm4); - - p_target0 += 1; - p_target1 += 1; - p_target2 += 1; - p_target3 += 1; - } - /*asm volatile - ( - ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END\n\t" - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - "movaps (%[src4]), %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $16, %[src1]\n\t" - "add $16, %[src2]\n\t" - "add $16, %[src3]\n\t" - "add $16, %[src4]\n\t" - "paddw %%xmm1, %%xmm2\n\t" - "paddw %%xmm1, %%xmm3\n\t" - "paddw %%xmm1, %%xmm4\n\t" - "paddw %%xmm1, %%xmm5\n\t" - "add $-1, %[bound]\n\t" - "movaps %%xmm2, (%[target0])\n\t" - "movaps %%xmm3, (%[target1])\n\t" - "movaps %%xmm4, (%[target2])\n\t" - "movaps %%xmm5, (%[target3])\n\t" - "add $16, %[target0]\n\t" - "add $16, %[target1]\n\t" - "add $16, %[target2]\n\t" - "add $16, %[target3]\n\t" - "jmp .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1\n\t" - ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) - :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - - */ - - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } -} -#endif /*LV_HAVE_SSE2*/ - - -#ifdef LV_HAVE_GENERIC - -static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - for(i = 0; i < 
bound; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } -} - -#endif /* LV_HAVE_GENERIC */ - - - - - -#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H*/ diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h new file mode 100644 index 000000000..227a92303 --- /dev/null +++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H +#define INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H + +#include +#include + +#ifdef LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); + __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = 
_mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); + qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = *int16ComplexVectorPtr++; + *qBufferPtr++ = *int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#ifdef LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, 
_MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); + + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16ic_deinterleave_16i_x2_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16ic_deinterleave_16i_x2_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_deinterleave_16i_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h deleted file mode 100644 index 227a92303..000000000 --- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H -#define INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H - -#include -#include - -#ifdef LV_HAVE_SSSE3 -#include -/*! 
- \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); - __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); - qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = *int16ComplexVectorPtr++; - *qBufferPtr++ = *int16ComplexVectorPtr++; - } -} -#endif /* 
LV_HAVE_SSSE3 */ - -#ifdef LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, 
_MM_SHUFFLE(3,1,2,0)); - - qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); - - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! 
- \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -extern void volk_16ic_deinterleave_16i_x2_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16ic_deinterleave_16i_x2_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16ic_deinterleave_16i_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h new file mode 100644 index 000000000..35d0e8be2 --- /dev/null +++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a16_H +#define INCLUDED_volk_16ic_deinterleave_real_16i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSSE3 +#include +/*! 
+ \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + iOutputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + + +#ifdef LV_HAVE_SSE2 +#include +/*! 
+ \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i complexVal1, complexVal2, iOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h deleted file mode 100644 index 35d0e8be2..000000000 --- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a16_H -#define INCLUDED_volk_16ic_deinterleave_real_16i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSSE3 -#include -/*! 
- \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - - iOutputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - - -#ifdef LV_HAVE_SSE2 -#include -/*! 
- \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i complexVal1, complexVal2, iOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); - - iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h new file mode 100644 index 000000000..bdf5fc162 --- /dev/null +++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a16_H +#define INCLUDED_volk_16ic_deinterleave_real_8i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSSE3 +#include +/*! 
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + complexVal1 = _mm_or_si128(complexVal1, complexVal2); + + complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); + complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); + + complexVal3 = _mm_or_si128(complexVal3, complexVal4); + + + complexVal1 = _mm_srai_epi16(complexVal1, 8); + complexVal3 = _mm_srai_epi16(complexVal3, 8); + + iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = 
((int8_t)(*int16ComplexVectorPtr++ >> 8)); + int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + int16_t* complexVectorPtr = (int16_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16ic_deinterleave_real_8i_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16ic_deinterleave_real_8i_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_deinterleave_real_8i_a16_orc_impl(iBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h deleted file mode 100644 index bdf5fc162..000000000 --- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a16_H -#define INCLUDED_volk_16ic_deinterleave_real_8i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSSE3 -#include -/*! 
- \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - - complexVal1 = _mm_or_si128(complexVal1, complexVal2); - - complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); - complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); - - complexVal3 = _mm_or_si128(complexVal3, complexVal4); - - - complexVal1 = _mm_srai_epi16(complexVal1, 8); - complexVal3 = _mm_srai_epi16(complexVal3, 8); - - iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = 
((int8_t)(*int16ComplexVectorPtr++ >> 8)); - int16ComplexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - int16_t* complexVectorPtr = (int16_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -extern void volk_16ic_deinterleave_real_8i_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16ic_deinterleave_real_8i_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16ic_deinterleave_real_8i_a16_orc_impl(iBuffer, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a16_H */ diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a.h b/volk/include/volk/volk_16ic_magnitude_16i_a.h new file mode 100644 index 000000000..73c6f3390 --- /dev/null +++ b/volk/include/volk/volk_16ic_magnitude_16i_a.h @@ -0,0 +1,191 @@ +#ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H +#define INCLUDED_volk_16ic_magnitude_16i_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +/*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, result; + + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + 
_mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = 
(float)(complexVectorPtr[3]); + + cplxValue1 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue2 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float scalar = 32768.0; + for(number = 0; number < num_points; number++){ + float real = ((float)(*complexVectorPtr++)) / scalar; + float imag = ((float)(*complexVectorPtr++)) / scalar; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC_DISABLED +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16ic_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); +static inline void volk_16ic_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_magnitude_16i_a16_H */ diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a16.h b/volk/include/volk/volk_16ic_magnitude_16i_a16.h deleted file mode 100644 index 73c6f3390..000000000 --- 
a/volk/include/volk/volk_16ic_magnitude_16i_a16.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H -#define INCLUDED_volk_16ic_magnitude_16i_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(32768.0); - __m128 invScalar = _mm_set_ps1(1.0/32768.0); - - __m128 cplxValue1, cplxValue2, result; - - __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = 
_mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - result = _mm_mul_ps(result, vScalar); // Scale the results - - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / 32768.0; - const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; - *magnitudeVectorPtr++ = (int16_t)(val1Result); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(32768.0); - __m128 invScalar = _mm_set_ps1(1.0/32768.0); - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - - __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - cplxValue1 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - cplxValue2 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square 
the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - result = _mm_mul_ps(result, vScalar); // Scale the results - - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / 32768.0; - const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; - *magnitudeVectorPtr++ = (int16_t)(val1Result); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float scalar = 32768.0; - for(number = 0; number < num_points; number++){ - float real = ((float)(*complexVectorPtr++)) / scalar; - float imag = ((float)(*complexVectorPtr++)) / scalar; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC_DISABLED -/*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_16ic_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); -static inline void volk_16ic_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16ic_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16ic_magnitude_16i_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h new file mode 100644 index 000000000..e4a9015b4 --- /dev/null +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H +#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE +#include + /*! 
+ \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + uint64_t number = 0; + const uint64_t quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; + + for(;number < quarterPoints; number++){ + + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = 
(int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! 
+ \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +extern void volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h deleted file mode 100644 index e4a9015b4..000000000 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H -#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE -#include - /*! 
- \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - uint64_t number = 0; - const uint64_t quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; - - for(;number < quarterPoints; number++){ - - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); - - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = 
(int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! 
- \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -extern void volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h new file mode 100644 index 000000000..993445995 --- /dev/null +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H +#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! 
+ \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + + for(;number < quarterPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi16_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; + sixteenTComplexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar = 1.0/scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h deleted file mode 100644 index 993445995..000000000 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H -#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! 
- \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - - for(;number < quarterPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm_cvtepi16_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; - sixteenTComplexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; - - const float iScalar = 1.0/scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; - - iValue = _mm_load_ps(floatBuffer); - - iValue = _mm_mul_ps(iValue, invScalar); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h new file mode 100644 index 000000000..a136c0535 --- /dev/null +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h @@ -0,0 +1,180 @@ +#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H +#define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +/*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + __m128 cplxValue1, cplxValue2, result; + + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + 
+ magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) / scalar; + float val1Imag = (float)(*complexVectorPtr++) / scalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + + __m128 cplxValue1, cplxValue2, result, re, im; + + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; + + for(;number < quarterPoints; number++){ + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = 
_mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); + im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(re, invScalar); + cplxValue2 = _mm_mul_ps(im, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) * iScalar; + float val1Imag = (float)(*complexVectorPtr++) * iScalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} + + +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + float real = ( (float) (*complexVectorPtr++)) * invScalar; + float imag = ( (float) (*complexVectorPtr++)) * invScalar; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC_DISABLED +/*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16ic_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16ic_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16ic_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h deleted file mode 100644 index a136c0535..000000000 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h +++ /dev/null @@ -1,180 +0,0 @@ -#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H -#define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -/*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - __m128 cplxValue1, cplxValue2, result; - - __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - 
- magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) / scalar; - float val1Imag = (float)(*complexVectorPtr++) / scalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - - __m128 cplxValue1, cplxValue2, result, re, im; - - __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; - - for(;number < quarterPoints; number++){ - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = 
_mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); - im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(re, invScalar); - cplxValue2 = _mm_mul_ps(im, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) * iScalar; - float val1Imag = (float)(*complexVectorPtr++) * iScalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} - - -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - float real = ( (float) (*complexVectorPtr++)) * invScalar; - float imag = ( (float) (*complexVectorPtr++)) * invScalar; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC_DISABLED -/*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_16ic_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16ic_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16ic_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_16u_byteswap_a.h b/volk/include/volk/volk_16u_byteswap_a.h new file mode 100644 index 000000000..f393c05c5 --- /dev/null +++ b/volk/include/volk/volk_16u_byteswap_a.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_16u_byteswap_a16_H +#define INCLUDED_volk_16u_byteswap_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the 16t values, increment inputPtr later since we're doing it in-place. 
+ input = _mm_load_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + + // Byteswap any remaining points: + number = eighthPoints*8; + for(; number < num_points; number++){ + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for(point = 0; point < num_points; point++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. 
+ \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +extern void volk_16u_byteswap_a16_orc_impl(uint16_t* intsToSwap, unsigned int num_points); +static inline void volk_16u_byteswap_a16_orc(uint16_t* intsToSwap, unsigned int num_points){ + volk_16u_byteswap_a16_orc_impl(intsToSwap, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_16u_byteswap_a16.h b/volk/include/volk/volk_16u_byteswap_a16.h deleted file mode 100644 index f393c05c5..000000000 --- a/volk/include/volk/volk_16u_byteswap_a16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_16u_byteswap_a16_H -#define INCLUDED_volk_16u_byteswap_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - -/*! - \brief Byteswaps (in-place) an aligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - uint16_t* inputPtr = intsToSwap; - __m128i input, left, right, output; - - const unsigned int eighthPoints = num_points / 8; - for(;number < eighthPoints; number++){ - // Load the 16t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - // Do the two shifts - left = _mm_slli_epi16(input, 8); - right = _mm_srli_epi16(input, 8); - // Or the left and right halves together - output = _mm_or_si128(left, right); - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 8; - } - - - // Byteswap any remaining points: - number = eighthPoints*8; - for(; number < num_points; number++){ - uint16_t outputVal = *inputPtr; - outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); - *inputPtr = outputVal; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Byteswaps (in-place) an aligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int point; - uint16_t* inputPtr = intsToSwap; - for(point = 0; point < num_points; point++){ - uint16_t output = *inputPtr; - output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); - *inputPtr = output; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Byteswaps (in-place) an aligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -extern void volk_16u_byteswap_a16_orc_impl(uint16_t* intsToSwap, unsigned int num_points); -static inline void volk_16u_byteswap_a16_orc(uint16_t* intsToSwap, unsigned int num_points){ - volk_16u_byteswap_a16_orc_impl(intsToSwap, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a.h b/volk/include/volk/volk_32f_accumulator_s32f_a.h new file mode 100644 index 000000000..dd24a1e29 --- /dev/null +++ b/volk/include/volk/volk_32f_accumulator_s32f_a.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H +#define INCLUDED_volk_32f_accumulator_s32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; + + __m128 accumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + aVal = _mm_load_ps(aPtr); + accumulator = _mm_add_ps(accumulator, aVal); + aPtr += 4; + } + _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_32f_accumulator_s32f_a16_generic(float* result, const float* inputBuffer, unsigned int num_points){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + float returnValue = 0; + + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_accumulator_s32f_a16_H */ diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a16.h b/volk/include/volk/volk_32f_accumulator_s32f_a16.h deleted file mode 100644 index dd24a1e29..000000000 --- a/volk/include/volk/volk_32f_accumulator_s32f_a16.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H -#define INCLUDED_volk_32f_accumulator_s32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Accumulates the values in the input buffer - \param result The accumulated result - \param inputBuffer The buffer of data to be accumulated - \param num_points The number of values in inputBuffer to be accumulated -*/ -static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; - - __m128 accumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - accumulator = _mm_add_ps(accumulator, aVal); - aPtr += 4; - } - _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container - returnValue = tempBuffer[0]; - returnValue += tempBuffer[1]; - returnValue += tempBuffer[2]; - returnValue += tempBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Accumulates the values in the input buffer - \param result The accumulated result - \param inputBuffer The buffer of data to be accumulated - \param num_points The number of values in inputBuffer to be accumulated -*/ -static inline void volk_32f_accumulator_s32f_a16_generic(float* result, const float* inputBuffer, unsigned int num_points){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - float returnValue = 0; - - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_accumulator_s32f_a16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_a.h b/volk/include/volk/volk_32f_convert_64f_a.h new file mode 100644 index 000000000..8ca83220b --- /dev/null +++ b/volk/include/volk/volk_32f_convert_64f_a.h @@ -0,0 +1,70 @@ +#ifndef INCLUDED_volk_32f_convert_64f_a16_H +#define INCLUDED_volk_32f_convert_64f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! 
+ \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_a16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_a16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_convert_64f_a16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_a16.h b/volk/include/volk/volk_32f_convert_64f_a16.h deleted file mode 100644 index 8ca83220b..000000000 --- a/volk/include/volk/volk_32f_convert_64f_a16.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INCLUDED_volk_32f_convert_64f_a16_H -#define INCLUDED_volk_32f_convert_64f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - /*! 
- \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_32f_convert_64f_a16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; - - for(;number < quarterPoints; number++){ - inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - ret = _mm_cvtps_pd(inputVal); - - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - - inputVal = _mm_movehl_ps(inputVal, inputVal); - - ret = _mm_cvtps_pd(inputVal); - - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_32f_convert_64f_a16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_convert_64f_a16_H */ diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/include/volk/volk_32f_index_max_16u_a.h new file mode 100644 index 000000000..af1f35348 --- /dev/null +++ b/volk/include/volk/volk_32f_index_max_16u_a.h @@ -0,0 +1,149 @@ +#ifndef INCLUDED_volk_32f_index_max_16u_a16_H +#define INCLUDED_volk_32f_index_max_16u_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = 
_mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); + maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + +#ifdef LV_HAVE_SSE +#include + +static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); + + maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number 
< 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE*/ + +#ifdef LV_HAVE_GENERIC +static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + float max = src0[0]; + unsigned int index = 0; + + int i = 1; + + for(; i < num_points; ++i) { + + if(src0[i] > max){ + index = i; + max = src0[i]; + } + + } + target[0] = index; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32f_index_max_16u_a16_H*/ diff --git a/volk/include/volk/volk_32f_index_max_16u_a16.h b/volk/include/volk/volk_32f_index_max_16u_a16.h deleted file mode 100644 index af1f35348..000000000 --- a/volk/include/volk/volk_32f_index_max_16u_a16.h +++ /dev/null @@ -1,149 +0,0 @@ -#ifndef INCLUDED_volk_32f_index_max_16u_a16_H -#define INCLUDED_volk_32f_index_max_16u_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - -static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); - - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; - - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - - for(;number < quarterPoints; number++){ - - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, 
indexIncrementValues); - - compareResults = _mm_cmpgt_ps(maxValues, currentValues); - - maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); - } - - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; - } - } - target[0] = (unsigned int)index; - } -} - -#endif /*LV_HAVE_SSE4_1*/ - -#ifdef LV_HAVE_SSE -#include - -static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); - - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; - - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - - for(;number < quarterPoints; number++){ - - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - - compareResults = _mm_cmpgt_ps(maxValues, currentValues); - - maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); - - maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); - } - - // Calculate the largest value from the remaining 4 points - 
_mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; - } - } - target[0] = (unsigned int)index; - } -} - -#endif /*LV_HAVE_SSE*/ - -#ifdef LV_HAVE_GENERIC -static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - float max = src0[0]; - unsigned int index = 0; - - int i = 1; - - for(; i < num_points; ++i) { - - if(src0[i] > max){ - index = i; - max = src0[i]; - } - - } - target[0] = index; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32f_index_max_16u_a16_H*/ diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h new file mode 100644 index 000000000..6efd21a37 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H +#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) + \param bound The interval that the input phase data is in, which is used to modulo the differentiation + \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. + \param num_noints The number of real values in the input vector. 
+*/ +static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ + if (num_points < 1) { + return; + } + unsigned int number = 1; + unsigned int j = 0; + // num_points-1 keeps Fedora 7's gcc from crashing... + // num_points won't work. :( + const unsigned int quarterPoints = (num_points-1) / 4; + + float* outPtr = outputVector; + const float* inPtr = inputVector; + __m128 upperBound = _mm_set_ps1(bound); + __m128 lowerBound = _mm_set_ps1(-bound); + __m128 next3old1; + __m128 next4; + __m128 boundAdjust; + __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. + __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. + // Do the first 4 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + for (; number < quarterPoints; number++) { + // Load data + next3old1 = _mm_loadu_ps((float*) (inPtr-1)); + next4 = _mm_load_ps(inPtr); + inPtr += 4; + // Subtract and store: + next3old1 = _mm_sub_ps(next4, next3old1); + // Bound: + boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); + boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); + next4 = _mm_cmplt_ps(next3old1, lowerBound); + next4 = _mm_and_ps(next4, negBoundAdjust); + boundAdjust = _mm_or_ps(next4, boundAdjust); + // Make sure we're in the bounding interval: + next3old1 = _mm_add_ps(next3old1, boundAdjust); + _mm_store_ps(outPtr,next3old1); // Store the results back into the output + outPtr += 4; + } + + for (number = (4 > (quarterPoints*4) ? 
4 : (4 * quarterPoints)); number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points-1]; +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) + \param bound The interval that the input phase data is in, which is used to modulo the differentiation + \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. + \param num_points The number of real values in the input vector. +*/ +static inline void volk_32f_s32f_32f_fm_detect_32f_a16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ + if (num_points < 1) { + return; + } + unsigned int number = 0; + float* outPtr = outputVector; + const float* inPtr = inputVector; + + // Do the first 1 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + + for (number = 1; number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points-1]; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h deleted file mode 100644 index 6efd21a37..000000000 --- 
a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H -#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) - \param bound The interval that the input phase data is in, which is used to modulo the differentiation - \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. - \param num_noints The number of real values in the input vector. -*/ -static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ - if (num_points < 1) { - return; - } - unsigned int number = 1; - unsigned int j = 0; - // num_points-1 keeps Fedora 7's gcc from crashing... - // num_points won't work. :( - const unsigned int quarterPoints = (num_points-1) / 4; - - float* outPtr = outputVector; - const float* inPtr = inputVector; - __m128 upperBound = _mm_set_ps1(bound); - __m128 lowerBound = _mm_set_ps1(-bound); - __m128 next3old1; - __m128 next4; - __m128 boundAdjust; - __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. - __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. - // Do the first 4 by hand since we're going in from the saveValue: - *outPtr = *inPtr - *saveValue; - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - for (j = 1; j < ( (4 < num_points) ? 
4 : num_points); j++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - for (; number < quarterPoints; number++) { - // Load data - next3old1 = _mm_loadu_ps((float*) (inPtr-1)); - next4 = _mm_load_ps(inPtr); - inPtr += 4; - // Subtract and store: - next3old1 = _mm_sub_ps(next4, next3old1); - // Bound: - boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); - boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); - next4 = _mm_cmplt_ps(next3old1, lowerBound); - next4 = _mm_and_ps(next4, negBoundAdjust); - boundAdjust = _mm_or_ps(next4, boundAdjust); - // Make sure we're in the bounding interval: - next3old1 = _mm_add_ps(next3old1, boundAdjust); - _mm_store_ps(outPtr,next3old1); // Store the results back into the output - outPtr += 4; - } - - for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - *saveValue = inputVector[num_points-1]; -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) - \param bound The interval that the input phase data is in, which is used to modulo the differentiation - \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. - \param num_points The number of real values in the input vector. 
-*/ -static inline void volk_32f_s32f_32f_fm_detect_32f_a16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ - if (num_points < 1) { - return; - } - unsigned int number = 0; - float* outPtr = outputVector; - const float* inPtr = inputVector; - - // Do the first 1 by hand since we're going in from the saveValue: - *outPtr = *inPtr - *saveValue; - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - - for (number = 1; number < num_points; number++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - *saveValue = inputVector[num_points-1]; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h new file mode 100644 index 000000000..f5b388e6d --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h @@ -0,0 +1,168 @@ +#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H +#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. 
+ + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* dataPointsPtr = realDataPoints; + __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; + + __m128 dataPointsVal; + __m128 avgPointsVal = _mm_setzero_ps(); + // Calculate the sum (for mean) for all points + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); + } + + _mm_store_ps(avgPointsVector, avgPointsVal); + + float sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more) + const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; + + dataPointsPtr = realDataPoints; // Reset the dataPointsPtr + __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); + __m128 vOnesVector = _mm_set_ps1(1.0); + __m128 vValidBinCount = _mm_setzero_ps(); + avgPointsVal = _mm_setzero_ps(); + __m128 compareMask; + number = 0; + // Calculate the sum (for mean) for any points which do NOT exceed the mean 
amplitude + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + // Identify which items do not exceed the mean amplitude + compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); + + // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude + avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); + + // Count the number of bins which do not exceed the mean amplitude + vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); + } + + // Calculate the mean from the remaining data points + _mm_store_ps(avgPointsVector, avgPointsVal); + + sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + // Calculate the number of valid bins from the remaining count + __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4]; + _mm_store_ps(validBinCountVector, vValidBinCount); + + float validBinCount = 0; + validBinCount += validBinCountVector[0]; + validBinCount += validBinCountVector[1]; + validBinCount += validBinCountVector[2]; + validBinCount += validBinCountVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(realDataPoints[number] <= meanAmplitude){ + sumMean += realDataPoints[number]; + validBinCount += 1.0; + } + } + + float localNoiseFloorAmplitude = 0; + if(validBinCount > 0.0){ + localNoiseFloorAmplitude = sumMean / validBinCount; + } + else{ + localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... + } + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + float sumMean = 0.0; + unsigned int number; + // find the sum (for mean), etc + for(number = 0; number < num_points; number++){ + // sum (for mean) + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more) + const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; + + // now throw out any bins higher than the mean + sumMean = 0.0; + unsigned int newNumDataPoints = num_points; + for(number = 0; number < num_points; number++){ + if (realDataPoints[number] <= meanAmplitude) + sumMean += realDataPoints[number]; + else + newNumDataPoints--; + } + + float localNoiseFloorAmplitude = 0.0; + if (newNumDataPoints == 0) // in the odd case that all + localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! 
+ else + localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h deleted file mode 100644 index f5b388e6d..000000000 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h +++ /dev/null @@ -1,168 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H -#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Calculates the spectral noise floor of an input power spectrum - - Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. 
- - \param realDataPoints The input power spectrum - \param num_points The number of data points in the input power spectrum vector - \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 - \param noiseFloorAmplitude The noise floor of the input spectrum, in dB -*/ -static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* dataPointsPtr = realDataPoints; - __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; - - __m128 dataPointsVal; - __m128 avgPointsVal = _mm_setzero_ps(); - // Calculate the sum (for mean) for all points - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); - } - - _mm_store_ps(avgPointsVector, avgPointsVal); - - float sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more - const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; - - dataPointsPtr = realDataPoints; // Reset the dataPointsPtr - __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); - __m128 vOnesVector = _mm_set_ps1(1.0); - __m128 vValidBinCount = _mm_setzero_ps(); - avgPointsVal = _mm_setzero_ps(); - __m128 compareMask; - number = 0; - // Calculate the sum (for mean) for any points which do NOT exceed the mean 
amplitude - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - // Identify which items do not exceed the mean amplitude - compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); - - // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude - avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); - - // Count the number of bins which do not exceed the mean amplitude - vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); - } - - // Calculate the mean from the remaining data points - _mm_store_ps(avgPointsVector, avgPointsVal); - - sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - // Calculate the number of valid bins from the remaning count - __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4]; - _mm_store_ps(validBinCountVector, vValidBinCount); - - float validBinCount = 0; - validBinCount += validBinCountVector[0]; - validBinCount += validBinCountVector[1]; - validBinCount += validBinCountVector[2]; - validBinCount += validBinCountVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(realDataPoints[number] <= meanAmplitude){ - sumMean += realDataPoints[number]; - validBinCount += 1.0; - } - } - - float localNoiseFloorAmplitude = 0; - if(validBinCount > 0.0){ - localNoiseFloorAmplitude = sumMean / validBinCount; - } - else{ - localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... - } - - *noiseFloorAmplitude = localNoiseFloorAmplitude; -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Calculates the spectral noise floor of an input power spectrum - - Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. - - \param realDataPoints The input power spectrum - \param num_points The number of data points in the input power spectrum vector - \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 - \param noiseFloorAmplitude The noise floor of the input spectrum, in dB -*/ -static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ - float sumMean = 0.0; - unsigned int number; - // find the sum (for mean), etc - for(number = 0; number < num_points; number++){ - // sum (for mean) - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more) - const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; - - // now throw out any bins higher than the mean - sumMean = 0.0; - unsigned int newNumDataPoints = num_points; - for(number = 0; number < num_points; number++){ - if (realDataPoints[number] <= meanAmplitude) - sumMean += realDataPoints[number]; - else - newNumDataPoints--; - } - - float localNoiseFloorAmplitude = 0.0; - if (newNumDataPoints == 0) // in the odd case that all - localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! 
- else - localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); - - *noiseFloorAmplitude = localNoiseFloorAmplitude; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h new file mode 100644 index 000000000..4acd2e13e --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -0,0 +1,111 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H +#define INCLUDED_volk_32f_s32f_convert_16i_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; 
number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_16i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h deleted file mode 100644 index 4acd2e13e..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H -#define INCLUDED_volk_32f_s32f_convert_16i_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16i_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_16i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h new file mode 100644 index 000000000..3f5044313 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -0,0 +1,143 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H +#define INCLUDED_volk_32f_s32f_convert_32i_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_AVX +#include + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1; + __m256i intInputVal1; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; + + intInputVal1 = _mm256_cvtps_epi32(_mm256_mul_ps(inputVal1, vScalar)); + + _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE2 +#include + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); /* scale one point, then truncate toward zero via the C cast */ + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_32i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h deleted file mode 100644 index 3f5044313..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h +++ /dev/null @@ -1,143 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H -#define INCLUDED_volk_32f_s32f_convert_32i_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_AVX -#include - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32i_a16_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m256 vScalar = _mm256_set1_ps(scalar); - __m256 inputVal1; - __m256i intInputVal1; - - for(;number < eighthPoints; number++){ - inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - - intInputVal1 = _mm256_cvtps_epi32(_mm256_mul_ps(inputVal1, vScalar)); - - _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_AVX */ - -#ifdef LV_HAVE_SSE2 -#include - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32i_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_32i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h new file mode 100644 index 000000000..c114ea38f --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h @@ -0,0 +1,118 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H +#define INCLUDED_volk_32f_s32f_convert_8i_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; /* number of full 16-float groups */ + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); /* scalar replicated into all four lanes */ + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); /* scale, then convert using the current SSE rounding mode */ + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); /* 2 x 4 int32 -> 8 int16 with signed saturation */ + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); /* 2 x 8 int16 -> 16 int8 with signed saturation */ + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; /* first index the vector loop did not handle */ + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); /* scalar tail: plain cast, no saturation as in the packs above */ + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; /* number of full 4-float groups */ + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); /* scalar replicated into all four lanes */ + __m128 ret; + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; /* scratch for the four products of one iteration */ + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); /* only the multiply is vectorized; each product is converted below with a C cast */ + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; /* first index the vector loop did not handle */ + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); /* scalar tail, same cast as the loop above */ + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = (int8_t)(*inputVectorPtr++ * scalar); /* scale one point, then convert with a plain cast (no range clamping here) */ + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_8i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h deleted file mode 100644 index c114ea38f..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h +++ /dev/null @@ -1,118 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H -#define INCLUDED_volk_32f_s32f_convert_8i_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = (int8_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_8i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_normalize_a.h b/volk/include/volk/volk_32f_s32f_normalize_a.h new file mode 100644 index 000000000..e6195cd32 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_normalize_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_s32f_normalize_a16_H +#define INCLUDED_volk_32f_s32f_normalize_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) + \param vecBuffer The buffer of values to be normalized in place + \param num_points The number of values in vecBuffer + \param scalar The scale value to be applied to each buffer value +*/ +static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + + const float invScalar = 1.0 / scalar; /* multiply by the reciprocal instead of dividing every point */ + __m128 vecScalar = _mm_set_ps1(invScalar); + + __m128 input1; + + const uint64_t quarterPoints = num_points / 4; /* number of full 4-float groups */ + for(;number < quarterPoints; number++){ + + input1 = _mm_load_ps(inputPtr); + + input1 = _mm_mul_ps(input1, vecScalar); + + _mm_store_ps(inputPtr, input1); /* result overwrites the input */ + + inputPtr += 4; + } + + number = quarterPoints*4; /* first index the vector loop did not handle */ + for(; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) + \param vecBuffer The buffer of values to be normalized in place + \param scalar The value each point is divided by + \param num_points The number of values in vecBuffer + \note Each point is multiplied by the reciprocal 1.0 / scalar rather than divided +*/ +static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) using the external ORC kernel + \param vecBuffer The buffer of values to be normalized; passed to the kernel as both dst and src + \param scalar The value each point is divided by; its reciprocal is what gets passed to the kernel + \param num_points The number of values in vecBuffer + \note volk_32f_s32f_normalize_a16_orc_impl is only declared here, so its behavior is not visible in this header +*/ +extern void volk_32f_s32f_normalize_a16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); +static inline void volk_32f_s32f_normalize_a16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ + float invscalar = 1.0 / scalar; + volk_32f_s32f_normalize_a16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_normalize_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_normalize_a16.h b/volk/include/volk/volk_32f_s32f_normalize_a16.h deleted file mode 100644 index e6195cd32..000000000 --- a/volk/include/volk/volk_32f_s32f_normalize_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_normalize_a16_H -#define INCLUDED_volk_32f_s32f_normalize_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) - \param vecBuffer The buffer of values to be vectorized - \param num_points The number of values in vecBuffer - \param scalar The scale value to be applied to each buffer value -*/ -static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* inputPtr = vecBuffer; - - const float invScalar = 1.0 / scalar; - __m128 vecScalar = _mm_set_ps1(invScalar); - - __m128 input1; - - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - - input1 = _mm_load_ps(inputPtr); - - input1 = _mm_mul_ps(input1, vecScalar); - - _mm_store_ps(inputPtr, input1); - - inputPtr += 4; - } - - number = quarterPoints*4; - for(; number < num_points; number++){ - *inputPtr *= invScalar; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Normalizes the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be normalizeed - \param bVector One of the vectors to be normalizeed - \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector -*/ -static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* inputPtr = vecBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *inputPtr *= invScalar; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! 
- \brief Normalizes the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be normalizeed - \param bVector One of the vectors to be normalizeed - \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector -*/ -extern void volk_32f_s32f_normalize_a16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); -static inline void volk_32f_s32f_normalize_a16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ - float invscalar = 1.0 / scalar; - volk_32f_s32f_normalize_a16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_normalize_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/include/volk/volk_32f_s32f_power_32f_a.h new file mode 100644 index 000000000..ecff901e1 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_power_32f_a.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_volk_32f_s32f_power_32f_a16_H +#define INCLUDED_volk_32f_s32f_power_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! 
+ \brief Raises each input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); /* factor re-applied to lanes whose input was negative */ + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); /* marks the lanes where the input is below zero */ + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_blendv_ps(aVal, negatedValues, signMask); /* negative lanes take the negated value, so powf4 sees no negative base */ + + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); // multiply by negativeOneToPower in the marked lanes and by 1 elsewhere + + _mm_store_ps(cPtr,cVal); // Store the results into cVector + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); // scalar tail; processes every point when LV_HAVE_LIB_SIMDMATH is not defined + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! 
+ \brief Raises each input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); /* factor re-applied to lanes whose input was negative */ + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); /* marks the lanes where the input is below zero */ + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); /* and/andnot/or select: negated value in marked lanes, original elsewhere */ + + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); // multiply by negativeOneToPower in the marked lanes and by 1 elsewhere + + _mm_store_ps(cPtr,cVal); // Store the results into cVector + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); // scalar tail; processes every point when LV_HAVE_LIB_SIMDMATH is not defined + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Raises each input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32f_s32f_power_32f_a16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); /* one powf call per point */ + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_power_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a16.h b/volk/include/volk/volk_32f_s32f_power_32f_a16.h deleted file mode 100644 index ecff901e1..000000000 --- a/volk/include/volk/volk_32f_s32f_power_32f_a16.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_power_32f_a16_H -#define INCLUDED_volk_32f_s32f_power_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! 
- \brief Takes each the input vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - __m128 zeroValue = _mm_setzero_ps(); - __m128 signMask; - __m128 negatedValues; - __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); - __m128 onesMask = _mm_set_ps1(1); - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - signMask = _mm_cmplt_ps(aVal, zeroValue); - negatedValues = _mm_sub_ps(zeroValue, aVal); - aVal = _mm_blendv_ps(aVal, negatedValues, signMask); - - // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after - cVal = powf4(aVal, vPower); // Takes each input value to the specified power - - cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - for(;number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! 
- \brief Takes each the input vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - __m128 zeroValue = _mm_setzero_ps(); - __m128 signMask; - __m128 negatedValues; - __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); - __m128 onesMask = _mm_set_ps1(1); - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - signMask = _mm_cmplt_ps(aVal, zeroValue); - negatedValues = _mm_sub_ps(zeroValue, aVal); - aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); - - // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after - cVal = powf4(aVal, vPower); // Takes each input value to the specified power - - cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - for(;number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Takes each the input vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector - */ -static inline void volk_32f_s32f_power_32f_a16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_power_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h new file mode 100644 index 000000000..c2b903657 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h @@ -0,0 +1,145 @@ +#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H +#define INCLUDED_volk_32f_s32f_stddev_32f_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! 
+ \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation (0 is written when num_points is 0) + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to use in the stddev calculation +*/ +static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; /* number of full 16-float groups */ + + const float* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); /* sum of the four squares, written to lane 0 only */ + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); /* same, written to lane 1 only */ + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); /* same, written to lane 2 only */ + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); /* same, written to lane 3 only */ + + cVal1 = _mm_or_ps(cVal1, cVal2); /* the four results sit in disjoint lanes, so OR merges them */ + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(squareBuffer,squareAccumulator); // Spill the four lane sums so they can be added up in scalar code + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; // mean of the squares + returnValue -= (mean * mean); // variance = mean of squares - mean^2 + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_s32f_stddev_32f_a16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h deleted file mode 100644 index c2b903657..000000000 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h +++ /dev/null @@ -1,145 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H -#define INCLUDED_volk_32f_s32f_stddev_32f_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! 
- \brief Calculates the standard deviation of the input buffer using the supplied mean - \param stddev The calculated standard deviation - \param inputBuffer The buffer of points to calculate the std deviation for - \param mean The mean of the input buffer - \param num_points The number of values in input buffer to used in the stddev calculation -*/ -static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ - float returnValue = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = inputBuffer; - - __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; - - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal1, aVal2, aVal3, aVal4; - __m128 cVal1, cVal2, cVal3, cVal4; - for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); - - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); - - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); - - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Calculates the standard deviation of the input buffer using the supplied mean - \param stddev The calculated standard deviation - \param inputBuffer The buffer of points to calculate the std deviation for - \param mean The mean of the input buffer - \param num_points The number of values in input buffer to used in the stddev calculation -*/ -static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ - float returnValue = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - - __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; - - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); // aVal = x - aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 - squareAccumulator = _mm_add_ps(squareAccumulator, aVal); - aPtr += 4; - } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Calculates the standard deviation of the input buffer using the supplied mean - \param stddev The calculated standard deviation - \param inputBuffer The buffer of points to calculate the std deviation for - \param mean The mean of the input buffer - \param num_points The number of values in input buffer to used in the stddev calculation -*/ -static inline void volk_32f_s32f_stddev_32f_a16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ - float returnValue = 0; - if(num_points > 0){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_sqrt_32f_a.h b/volk/include/volk/volk_32f_sqrt_32f_a.h new file mode 100644 index 000000000..a9ce76f88 --- /dev/null +++ b/volk/include/volk/volk_32f_sqrt_32f_a.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32f_sqrt_32f_a16_H +#define INCLUDED_volk_32f_sqrt_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_sqrt_ps(aVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +extern void volk_32f_sqrt_32f_a16_orc_impl(float *, const float*, unsigned int); +/*! 
+ \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_a16_orc(float* cVector, const float* aVector, unsigned int num_points){ + volk_32f_sqrt_32f_a16_orc_impl(cVector, aVector, num_points); +} + +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_volk_32f_sqrt_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_sqrt_32f_a16.h b/volk/include/volk/volk_32f_sqrt_32f_a16.h deleted file mode 100644 index a9ce76f88..000000000 --- a/volk/include/volk/volk_32f_sqrt_32f_a16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_32f_sqrt_32f_a16_H -#define INCLUDED_volk_32f_sqrt_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Sqrts the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be sqrted - \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector -*/ -static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - - cVal = _mm_sqrt_ps(aVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = sqrtf(*aPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Sqrts the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be sqrted - \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector -*/ -static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = sqrtf(*aPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -extern void volk_32f_sqrt_32f_a16_orc_impl(float *, const float*, unsigned int); -/*! - \brief Sqrts the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be sqrted - \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector -*/ -static inline void volk_32f_sqrt_32f_a16_orc(float* cVector, const float* aVector, unsigned int num_points){ - volk_32f_sqrt_32f_a16_orc_impl(cVector, aVector, num_points); -} - -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_volk_32f_sqrt_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h new file mode 100644 index 000000000..10d72e09d --- /dev/null +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h @@ -0,0 +1,170 @@ +#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H +#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! 
+ \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + 
newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += 
meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_x2_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h deleted file mode 100644 index 10d72e09d..000000000 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H -#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H - -#include -#include 
-#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal1, aVal2, aVal3, aVal4; - __m128 cVal1, cVal2, cVal3, cVal4; - for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); - accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x - - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); - accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x - - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); - accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x - - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); - accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the 
results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); // aVal = x - accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x - aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 - squareAccumulator = _mm_add_ps(squareAccumulator, aVal); - aPtr += 4; - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back 
into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_32f_x2_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_add_32f_a.h b/volk/include/volk/volk_32f_x2_add_32f_a.h new file mode 100644 index 000000000..2de6a6644 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_add_32f_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_x2_add_32f_a16_H +#define INCLUDED_volk_32f_x2_add_32f_a16_H + +#include +#include + +#ifdef 
LV_HAVE_SSE +#include +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +extern void volk_32f_x2_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_add_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_add_32f_a16.h b/volk/include/volk/volk_32f_x2_add_32f_a16.h deleted file mode 100644 index 2de6a6644..000000000 --- a/volk/include/volk/volk_32f_x2_add_32f_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_add_32f_a16_H -#define INCLUDED_volk_32f_x2_add_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! 
- \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -extern void volk_32f_x2_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_x2_add_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a.h b/volk/include/volk/volk_32f_x2_divide_32f_a.h new file mode 100644 index 000000000..1603e78de --- /dev/null +++ b/volk/include/volk/volk_32f_x2_divide_32f_a.h @@ -0,0 +1,82 @@ +#ifndef INCLUDED_volk_32f_x2_divide_32f_a16_H +#define INCLUDED_volk_32f_x2_divide_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_div_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Divides the two input vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divided + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divided and stored into cVector +*/ +extern void volk_32f_x2_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_volk_32f_x2_divide_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a16.h b/volk/include/volk/volk_32f_x2_divide_32f_a16.h deleted file mode 100644 index 1603e78de..000000000 --- a/volk/include/volk/volk_32f_x2_divide_32f_a16.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_divide_32f_a16_H -#define INCLUDED_volk_32f_x2_divide_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_div_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! 
- \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -extern void volk_32f_x2_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_volk_32f_x2_divide_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h new file mode 100644 index 000000000..2cd974070 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -0,0 +1,185 @@ +#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H +#define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H + +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < 
quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#ifdef LV_HAVE_SSE3 + +#include + +static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints 
= num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + + bVal1 = _mm_load_ps(bPtr); bPtr += 4; + bVal2 = _mm_load_ps(bPtr); bPtr += 4; + bVal3 = _mm_load_ps(bPtr); bPtr += 4; + bVal4 = _mm_load_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h deleted file mode 100644 index 2cd974070..000000000 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h +++ /dev/null @@ -1,185 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H -#define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H - -#include -#include - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const float * input, const 
float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#ifdef LV_HAVE_SSE - - -static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; - } - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#ifdef LV_HAVE_SSE3 - -#include - -static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); - - 
aPtr += 4; - bPtr += 4; - } - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_SSE4_1 - -#include - -static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - - bVal1 = _mm_load_ps(bPtr); bPtr += 4; - bVal2 = _mm_load_ps(bPtr); bPtr += 4; - bVal3 = _mm_load_ps(bPtr); bPtr += 4; - bVal4 = _mm_load_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - 
number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h new file mode 100644 index 000000000..f3731fa2a --- /dev/null +++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a16_H +#define INCLUDED_volk_32f_x2_interleave_32fc_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Interleaves the I & Q vector data into the complex vector + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + unsigned int number = 0; + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + const uint64_t quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + 
*complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Interleaves the I & Q vector data into the complex vector. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_x2_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h deleted file mode 100644 index f3731fa2a..000000000 --- a/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a16_H -#define INCLUDED_volk_32f_x2_interleave_32fc_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Interleaves the I & Q vector data into the complex vector - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param num_points The number of complex data values to be interleaved -*/ -static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ - unsigned int number = 0; - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - const uint64_t quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Interleaves the I & Q vector data into the complex vector. 
- \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param num_points The number of complex data values to be interleaved -*/ -static inline void volk_32f_x2_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_max_32f_a.h b/volk/include/volk/volk_32f_x2_max_32f_a.h new file mode 100644 index 000000000..60be6e36d --- /dev/null +++ b/volk/include/volk/volk_32f_x2_max_32f_a.h @@ -0,0 +1,85 @@ +#ifndef INCLUDED_volk_32f_x2_max_32f_a16_H +#define INCLUDED_volk_32f_x2_max_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_max_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_x2_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_max_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_max_32f_a16.h b/volk/include/volk/volk_32f_x2_max_32f_a16.h deleted file mode 100644 index 60be6e36d..000000000 --- a/volk/include/volk/volk_32f_x2_max_32f_a16.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_max_32f_a16_H -#define INCLUDED_volk_32f_x2_max_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_max_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! 
- \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -extern void volk_32f_x2_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_x2_max_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_min_32f_a.h b/volk/include/volk/volk_32f_x2_min_32f_a.h new file mode 100644 index 000000000..3b8291531 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_min_32f_a.h @@ -0,0 +1,85 @@ +#ifndef INCLUDED_volk_32f_x2_min_32f_a16_H +#define INCLUDED_volk_32f_x2_min_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_min_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_x2_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_min_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_min_32f_a16.h b/volk/include/volk/volk_32f_x2_min_32f_a16.h deleted file mode 100644 index 3b8291531..000000000 --- a/volk/include/volk/volk_32f_x2_min_32f_a16.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_min_32f_a16_H -#define INCLUDED_volk_32f_x2_min_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_min_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! 
- \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -extern void volk_32f_x2_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_x2_min_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a.h b/volk/include/volk/volk_32f_x2_multiply_32f_a.h new file mode 100644 index 000000000..885941abf --- /dev/null +++ b/volk/include/volk/volk_32f_x2_multiply_32f_a.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_32f_x2_multiply_32f_a16_H +#define INCLUDED_volk_32f_x2_multiply_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Multiplies the two input vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +/*! 
+ \brief Multiplies the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_a16_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +extern void volk_32f_x2_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_multiply_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h deleted file mode 100644 index 885941abf..000000000 --- a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_multiply_32f_a16_H -#define INCLUDED_volk_32f_x2_multiply_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_AVX -#include -/*! 
- \brief Multiplies the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_x2_multiply_32f_a16_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ - - aVal = _mm256_load_ps(aPtr); - bVal = _mm256_load_ps(bPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - - _mm256_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 8; - bPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_AVX */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! 
- \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -extern void volk_32f_x2_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_x2_multiply_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h new file mode 100644 index 000000000..f7ad3fd18 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h @@ -0,0 +1,156 @@ +#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H +#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. 
+ \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue1, cplxValue2; + __m128i intValue1, intValue2; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); + + intValue1 = _mm_cvtps_epi32(cplxValue1); + intValue2 = _mm_cvtps_epi32(cplxValue2); + + intValue1 = _mm_packs_epi32(intValue1, intValue2); + + _mm_store_si128((__m128i*)complexVectorPtr, intValue1); + complexVectorPtr += 8; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + /*! 
+ \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; 
+ complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_x2_s32f_interleave_16ic_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + int16_t* complexVectorPtr = (int16_t*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h deleted file mode 100644 index f7ad3fd18..000000000 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h +++ /dev/null @@ -1,156 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H -#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. 
- \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - const unsigned int quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue1, cplxValue2; - __m128i intValue1, intValue2; - - int16_t* complexVectorPtr = (int16_t*)complexVector; - - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); - cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); - cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); - - intValue1 = _mm_cvtps_epi32(cplxValue1); - intValue2 = _mm_cvtps_epi32(cplxValue2); - - intValue1 = _mm_packs_epi32(intValue1, intValue2); - - _mm_store_si128((__m128i*)complexVectorPtr, intValue1); - complexVectorPtr += 8; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } - -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include - /*! 
- \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - const unsigned int quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - - int16_t* complexVectorPtr = (int16_t*)complexVector; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); - - _mm_store_ps(floatBuffer, cplxValue); - - *complexVectorPtr++ = (int16_t)(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)(floatBuffer[3]); - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); - - _mm_store_ps(floatBuffer, cplxValue); - - *complexVectorPtr++ = (int16_t)(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)(floatBuffer[3]); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; 
- complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } - -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_x2_s32f_interleave_16ic_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - int16_t* complexVectorPtr = (int16_t*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a.h b/volk/include/volk/volk_32f_x2_subtract_32f_a.h new file mode 100644 index 000000000..c01f2c1f3 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_subtract_32f_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_x2_subtract_32f_a16_H +#define INCLUDED_volk_32f_x2_subtract_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_sub_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +extern void volk_32f_x2_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_subtract_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a16.h b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h deleted file mode 100644 index c01f2c1f3..000000000 --- a/volk/include/volk/volk_32f_x2_subtract_32f_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_subtract_32f_a16_H -#define INCLUDED_volk_32f_x2_subtract_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_sub_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! 
- \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -extern void volk_32f_x2_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_x2_subtract_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h new file mode 100644 index 000000000..6e446cbef --- /dev/null +++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h @@ -0,0 +1,151 @@ +#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H +#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H + +#include +#include +#include + +#ifndef MAX +#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +#endif + +#ifdef LV_HAVE_SSE3 +#include +#include + +static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; + + xmm9 = _mm_setzero_ps(); + xmm1 = _mm_setzero_ps(); + + xmm0 = _mm_load1_ps(&center_point_array[0]); + xmm6 = _mm_load1_ps(&center_point_array[1]); + xmm7 = _mm_load1_ps(&center_point_array[2]); + xmm8 = _mm_load1_ps(&center_point_array[3]); + //xmm11 = _mm_load1_ps(&center_point_array[4]); + xmm10 = _mm_load1_ps(cutoff); + + int bound = num_bytes >> 4; + int
leftovers = (num_bytes >> 2) & 3; + int i = 0; + + for(; i < bound; ++i) { + xmm2 = _mm_load_ps(src0); + xmm2 = _mm_max_ps(xmm10, xmm2); + xmm3 = _mm_mul_ps(xmm2, xmm2); + xmm4 = _mm_mul_ps(xmm2, xmm3); + xmm5 = _mm_mul_ps(xmm3, xmm3); + //xmm12 = _mm_mul_ps(xmm3, xmm4); + + xmm2 = _mm_mul_ps(xmm2, xmm0); + xmm3 = _mm_mul_ps(xmm3, xmm6); + xmm4 = _mm_mul_ps(xmm4, xmm7); + xmm5 = _mm_mul_ps(xmm5, xmm8); + //xmm12 = _mm_mul_ps(xmm12, xmm11); + + xmm2 = _mm_add_ps(xmm2, xmm3); + xmm3 = _mm_add_ps(xmm4, xmm5); + + src0 += 4; + + xmm9 = _mm_add_ps(xmm2, xmm9); + + xmm1 = _mm_add_ps(xmm3, xmm1); + + //xmm9 = _mm_add_ps(xmm12, xmm9); + } + + xmm2 = _mm_hadd_ps(xmm9, xmm1); + xmm3 = _mm_hadd_ps(xmm2, xmm2); + xmm4 = _mm_hadd_ps(xmm3, xmm3); + + _mm_store_ss(&result, xmm4); + + + + for(i = 0; i < leftovers; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth);// + + //center_point_array[4] * fith); + } + + result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; + + target[0] = result; +} + + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_GENERIC + +static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + int i = 0; + + for(; i < num_bytes >> 2; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth); //+ + //center_point_array[4] * fith); + /*printf("%f12...%d\n", (center_point_array[0] * 
fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth) + + //center_point_array[4] * fith) + + (center_point_array[4]), i); + */ + } + + result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); + + + + *target = result; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h deleted file mode 100644 index 6e446cbef..000000000 --- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h +++ /dev/null @@ -1,151 +0,0 @@ -#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H -#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H - -#include -#include -#include - -#ifndef MAX -#define MAX(X,Y) ((X) > (Y)?(X):(Y)) -#endif - -#ifdef LV_HAVE_SSE3 -#include -#include - -static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - - float result = 0.0; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - //float fith = 0.0; - - - - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; - - xmm9 = _mm_setzero_ps(); - xmm1 = _mm_setzero_ps(); - - xmm0 = _mm_load1_ps(&center_point_array[0]); - xmm6 = _mm_load1_ps(&center_point_array[1]); - xmm7 = _mm_load1_ps(&center_point_array[2]); - xmm8 = _mm_load1_ps(&center_point_array[3]); - //xmm11 = _mm_load1_ps(&center_point_array[4]); - xmm10 = _mm_load1_ps(cutoff); - - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 2) & 3; - int i = 0; - - for(; i < bound; ++i) { - xmm2 = _mm_load_ps(src0); - xmm2 = _mm_max_ps(xmm10, xmm2); - xmm3 = _mm_mul_ps(xmm2, xmm2); - xmm4 = _mm_mul_ps(xmm2, xmm3); - xmm5 = _mm_mul_ps(xmm3, xmm3); - //xmm12 = _mm_mul_ps(xmm3, xmm4); - - xmm2 = _mm_mul_ps(xmm2, xmm0); - xmm3 = _mm_mul_ps(xmm3, xmm6); - xmm4 = _mm_mul_ps(xmm4, xmm7); - xmm5 =
_mm_mul_ps(xmm5, xmm8); - //xmm12 = _mm_mul_ps(xmm12, xmm11); - - xmm2 = _mm_add_ps(xmm2, xmm3); - xmm3 = _mm_add_ps(xmm4, xmm5); - - src0 += 4; - - xmm9 = _mm_add_ps(xmm2, xmm9); - - xmm1 = _mm_add_ps(xmm3, xmm1); - - //xmm9 = _mm_add_ps(xmm12, xmm9); - } - - xmm2 = _mm_hadd_ps(xmm9, xmm1); - xmm3 = _mm_hadd_ps(xmm2, xmm2); - xmm4 = _mm_hadd_ps(xmm3, xmm3); - - _mm_store_ss(&result, xmm4); - - - - for(i = 0; i < leftovers; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth);// + - //center_point_array[4] * fith); - } - - result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; - - target[0] = result; -} - - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_GENERIC - -static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - - - float result = 0.0; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - //float fith = 0.0; - - - - int i = 0; - - for(; i < num_bytes >> 2; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); //+ - //center_point_array[4] * fith); - /*printf("%f12...%d\n", (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth) + - //center_point_array[4] * fith) + - (center_point_array[4]), i); - */ - } - - result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); - - - - *target = result; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif 
/*INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h new file mode 100644 index 000000000..846315a4a --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h @@ -0,0 +1,95 @@ +#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a16_H +#define INCLUDED_volk_32fc_32f_multiply_32fc_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Multiplies the input complex vector with the input float vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the float values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; + for(;number < quarterPoints; number++){ + + aVal1 = _mm_load_ps((const float*)aPtr); + aPtr += 2; + + aVal2 = _mm_load_ps((const float*)aPtr); + aPtr += 2; + + bVal = _mm_load_ps(bPtr); + bPtr += 4; + + bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); + bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); + + cVal = _mm_mul_ps(aVal1, bVal1); + + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + cPtr += 2; + + cVal = _mm_mul_ps(aVal2, bVal2); + + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * 
(*bPtr); + bPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_32f_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32fc_32f_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + volk_32fc_32f_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_GENERIC */ + + + +#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h 
b/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h deleted file mode 100644 index 846315a4a..000000000 --- a/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a16_H -#define INCLUDED_volk_32fc_32f_multiply_32fc_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include - /*! - \brief Multiplies the input complex vector with the input float vector and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The complex vector to be multiplied - \param bVector The vectors containing the float values to be multiplied against each complex value in aVector - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; - for(;number < quarterPoints; number++){ - - aVal1 = _mm_load_ps((const float*)aPtr); - aPtr += 2; - - aVal2 = _mm_load_ps((const float*)aPtr); - aPtr += 2; - - bVal = _mm_load_ps(bPtr); - bPtr += 4; - - bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); - bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); - - cVal = _mm_mul_ps(aVal1, bVal1); - - _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container - cPtr += 2; - - cVal = _mm_mul_ps(aVal2, bVal2); - - _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container - - cPtr += 2; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr); - bPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The complex vector to be multiplied - \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! - \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The complex vector to be multiplied - \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_32fc_32f_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32fc_32f_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - volk_32fc_32f_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_GENERIC */ - - - -#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h new file mode 100644 index 
000000000..3e7c3fa28 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H +#define INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h deleted file mode 100644 index 3e7c3fa28..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H -#define INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Deinterleaves the complex vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the complex vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h new file mode 100644 index 000000000..945a26742 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h @@ -0,0 +1,78 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H +#define INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! 
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h deleted file mode 100644 index 945a26742..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H -#define INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include -/*! 
- \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - - for(;number < halfPoints; number++){ - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(qBufferPtr, dVal); - - iBufferPtr += 2; - qBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_x2_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - *qBufferPtr++ = (double)*complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h new file mode 100644 index 000000000..3c3fb2583 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a16_H +#define INCLUDED_volk_32fc_deinterleave_real_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + float* iBufferPtr = iBuffer; + + __m128 cplxValue1, cplxValue2, iValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h deleted file mode 100644 index 3c3fb2583..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a16_H -#define INCLUDED_volk_32fc_deinterleave_real_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - float* iBufferPtr = iBuffer; - - __m128 cplxValue1, cplxValue2, iValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h new file mode 100644 index 000000000..40c1a7a46 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a16_H +#define INCLUDED_volk_32fc_deinterleave_real_64f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! 
+ \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + iBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_64f_a16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h deleted file mode 100644 index 40c1a7a46..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a16_H -#define INCLUDED_volk_32fc_deinterleave_real_64f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include -/*! 
- \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - for(;number < halfPoints; number++){ - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - iBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_64f_a16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a16_H */ diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/include/volk/volk_32fc_index_max_16u_a.h new file mode 100644 index 000000000..0ad1edbe9 --- /dev/null +++ b/volk/include/volk/volk_32fc_index_max_16u_a.h @@ -0,0 +1,215 @@ +#ifndef INCLUDED_volk_32fc_index_max_16u_a16_H +#define INCLUDED_volk_32fc_index_max_16u_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +#include + + +static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { + + + + union bit128 holderf; + union bit128 holderi; + float sq_dist = 0.0; + + + + + union bit128 xmm5, xmm4; + __m128 xmm1, xmm2, xmm3; + __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; + + xmm5.int_vec = xmmfive = _mm_setzero_si128(); + xmm4.int_vec = xmmfour = _mm_setzero_si128(); + holderf.int_vec = holder0 = _mm_setzero_si128(); + holderi.int_vec = holder1 = _mm_setzero_si128(); + + + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! 
+ xmm9 = xmm8 = _mm_setzero_si128(); + xmm10 = _mm_set_epi32(4, 4, 4, 4); + xmm3 = _mm_setzero_ps(); +; + + //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); + + for(; i < bound; ++i) { + + xmm1 = _mm_load_ps((float*)src0); + xmm2 = _mm_load_ps((float*)&src0[2]); + + + src0 += 4; + + + xmm1 = _mm_mul_ps(xmm1, xmm1); + xmm2 = _mm_mul_ps(xmm2, xmm2); + + + xmm1 = _mm_hadd_ps(xmm1, xmm2); + + xmm3 = _mm_max_ps(xmm1, xmm3); + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + + + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); + + } + + + for(i = 0; i < leftovers0; ++i) { + + + xmm2 = _mm_load_ps((float*)src0); + + xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8); + xmm8 = (__m128i)xmm1; + + xmm2 = _mm_mul_ps(xmm2, xmm2); + + src0 += 2; + + xmm1 = _mm_hadd_ps(xmm2, xmm2); + + xmm3 = _mm_max_ps(xmm1, xmm3); + + xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); + + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + } + + + + + for(i = 0; i < leftovers1; ++i) { + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + + sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * 
lv_cimag(src0[0]); + + xmm2 = _mm_load1_ps(&sq_dist); + + xmm1 = xmm3; + + xmm3 = _mm_max_ss(xmm3, xmm2); + + + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + xmm8 = _mm_shuffle_epi32(xmm8, 0x00); + + xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); + + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + } + + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); + + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + _mm_store_ps((float*)&(holderf.f), xmm3); + _mm_store_si128(&(holderi.int_vec), xmm9); + + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] > sq_dist) ? 
holderf.f[3] : sq_dist; + + + + /* + float placeholder = 0.0; + uint32_t temp0, temp1; + unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); + unsigned int l0 = g0 ^ 1; + + unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); + unsigned int l1 = g1 ^ 1; + + temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; + temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; + sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; + placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; + + g0 = (sq_dist > placeholder); + l0 = g0 ^ 1; + target[0] = g0 * temp0 + l0 * temp1; + */ + +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_GENERIC +static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { + float sq_dist = 0.0; + float max = 0.0; + unsigned int index = 0; + + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + + sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); + + index = sq_dist > max ? i : index; + max = sq_dist > max ? 
sq_dist : max; + + + } + target[0] = index; + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_index_max_16u_a16_H*/ diff --git a/volk/include/volk/volk_32fc_index_max_16u_a16.h b/volk/include/volk/volk_32fc_index_max_16u_a16.h deleted file mode 100644 index 0ad1edbe9..000000000 --- a/volk/include/volk/volk_32fc_index_max_16u_a16.h +++ /dev/null @@ -1,215 +0,0 @@ -#ifndef INCLUDED_volk_32fc_index_max_16u_a16_H -#define INCLUDED_volk_32fc_index_max_16u_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -#include - - -static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { - - - - union bit128 holderf; - union bit128 holderi; - float sq_dist = 0.0; - - - - - union bit128 xmm5, xmm4; - __m128 xmm1, xmm2, xmm3; - __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; - - xmm5.int_vec = xmmfive = _mm_setzero_si128(); - xmm4.int_vec = xmmfour = _mm_setzero_si128(); - holderf.int_vec = holder0 = _mm_setzero_si128(); - holderi.int_vec = holder1 = _mm_setzero_si128(); - - - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - - xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! 
- xmm9 = xmm8 = _mm_setzero_si128(); - xmm10 = _mm_set_epi32(4, 4, 4, 4); - xmm3 = _mm_setzero_ps(); -; - - //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); - - for(; i < bound; ++i) { - - xmm1 = _mm_load_ps((float*)src0); - xmm2 = _mm_load_ps((float*)&src0[2]); - - - src0 += 4; - - - xmm1 = _mm_mul_ps(xmm1, xmm1); - xmm2 = _mm_mul_ps(xmm2, xmm2); - - - xmm1 = _mm_hadd_ps(xmm1, xmm2); - - xmm3 = _mm_max_ps(xmm1, xmm3); - - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - - xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - - xmm9 = _mm_add_epi32(xmm11, xmm12); - - xmm8 = _mm_add_epi32(xmm8, xmm10); - - - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); - - } - - - for(i = 0; i < leftovers0; ++i) { - - - xmm2 = _mm_load_ps((float*)src0); - - xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8); - xmm8 = (__m128i)xmm1; - - xmm2 = _mm_mul_ps(xmm2, xmm2); - - src0 += 2; - - xmm1 = _mm_hadd_ps(xmm2, xmm2); - - xmm3 = _mm_max_ps(xmm1, xmm3); - - xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); - - - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - - xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - - xmm9 = _mm_add_epi32(xmm11, xmm12); - - xmm8 = _mm_add_epi32(xmm8, xmm10); - //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - - } - - - - - for(i = 0; i < leftovers1; ++i) { - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - - - sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * 
lv_cimag(src0[0]); - - xmm2 = _mm_load1_ps(&sq_dist); - - xmm1 = xmm3; - - xmm3 = _mm_max_ss(xmm3, xmm2); - - - - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - xmm8 = _mm_shuffle_epi32(xmm8, 0x00); - - xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); - - - xmm9 = _mm_add_epi32(xmm11, xmm12); - - } - - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); - - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - - _mm_store_ps((float*)&(holderf.f), xmm3); - _mm_store_si128(&(holderi.int_vec), xmm9); - - target[0] = holderi.i[0]; - sq_dist = holderf.f[0]; - target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; - sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; - target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; - sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; - target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; - sq_dist = (holderf.f[3] > sq_dist) ? 
holderf.f[3] : sq_dist; - - - - /* - float placeholder = 0.0; - uint32_t temp0, temp1; - unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); - unsigned int l0 = g0 ^ 1; - - unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); - unsigned int l1 = g1 ^ 1; - - temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; - temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; - sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; - placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; - - g0 = (sq_dist > placeholder); - l0 = g0 ^ 1; - target[0] = g0 * temp0 + l0 * temp1; - */ - -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_GENERIC -static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { - float sq_dist = 0.0; - float max = 0.0; - unsigned int index = 0; - - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - - sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); - - index = sq_dist > max ? i : index; - max = sq_dist > max ? sq_dist : max; - - - } - target[0] = index; - -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32fc_index_max_16u_a16_H*/ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a.h b/volk/include/volk/volk_32fc_magnitude_32f_a.h new file mode 100644 index 000000000..946190e41 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_32f_a.h @@ -0,0 +1,132 @@ +#ifndef INCLUDED_volk_32fc_magnitude_32f_a16_H +#define INCLUDED_volk_32fc_magnitude_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include + /*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +extern void volk_32fc_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); +static inline void volk_32fc_magnitude_32f_a16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + volk_32fc_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32fc_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a16.h b/volk/include/volk/volk_32fc_magnitude_32f_a16.h deleted file mode 100644 index 946190e41..000000000 --- a/volk/include/volk/volk_32fc_magnitude_32f_a16.h +++ /dev/null @@ -1,132 +0,0 @@ -#ifndef INCLUDED_volk_32fc_magnitude_32f_a16_H -#define 
INCLUDED_volk_32fc_magnitude_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include - /*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -extern void volk_32fc_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); -static inline void volk_32fc_magnitude_32f_a16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - volk_32fc_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32fc_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h new file mode 100644 index 000000000..55b1b6c70 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a16_H +#define 
INCLUDED_volk_32fc_s32f_atan2_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float invNormalizeFactor = 1.0 / normalizeFactor; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 testVector = _mm_set_ps1(2*M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase,testVector); + phase = _mm_blendv_ps(correctVector, phase, keepMask); + // done with above correction. 
+ phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; +#endif /* LV_HAVE_SIMDMATH_H */ + + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_SSE +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float invNormalizeFactor = 1.0 / normalizeFactor; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 testVector = _mm_set_ps1(2*M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 mask; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + // Arctan to get 
phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase,testVector); + phase = _mm_and_ps(phase, keepMask); + mask = _mm_andnot_ps(keepMask, correctVector); + phase = _mm_or_ps(phase, mask); + // done with above correction. + phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; +#endif /* LV_HAVE_SIMDMATH_H */ + + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The vector where the results will be stored. + \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. 
+*/ +static inline void volk_32fc_s32f_atan2_32f_a16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ + float* outPtr = outputVector; + const float* inPtr = (float*)inputVector; + const float invNormalizeFactor = 1.0 / normalizeFactor; + unsigned int number; + for ( number = 0; number < num_points; number++) { + const float real = *inPtr++; + const float imag = *inPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h deleted file mode 100644 index 55b1b6c70..000000000 --- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a16_H -#define INCLUDED_volk_32fc_s32f_atan2_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief performs the atan2 on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). - \param normalizeFactor The atan2 results will be divided by this normalization factor. - \param num_points The number of complex values in the input vector. 
-*/ -static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* outPtr = outputVector; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - const float invNormalizeFactor = 1.0 / normalizeFactor; - -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 testVector = _mm_set_ps1(2*M_PI); - __m128 correctVector = _mm_set_ps1(M_PI); - __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); - __m128 phase; - __m128 complex1, complex2, iValue, qValue; - __m128 keepMask; - - for (; number < quarterPoints; number++) { - // Load IQ data: - complex1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - complex2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); - // Arctan to get phase: - phase = atan2f4(qValue, iValue); - // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. - // Compare to 2pi: - keepMask = _mm_cmpneq_ps(phase,testVector); - phase = _mm_blendv_ps(correctVector, phase, keepMask); - // done with above correction. - phase = _mm_mul_ps(phase, vNormalizeFactor); - _mm_store_ps((float*)outPtr, phase); - outPtr += 4; - } - number = quarterPoints * 4; -#endif /* LV_HAVE_SIMDMATH_H */ - - for (; number < num_points; number++) { - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } -} -#endif /* LV_HAVE_SSE4_1 */ - - -#ifdef LV_HAVE_SSE -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief performs the atan2 on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. 
- \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). - \param normalizeFactor The atan2 results will be divided by this normalization factor. - \param num_points The number of complex values in the input vector. -*/ -static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* outPtr = outputVector; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - const float invNormalizeFactor = 1.0 / normalizeFactor; - -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 testVector = _mm_set_ps1(2*M_PI); - __m128 correctVector = _mm_set_ps1(M_PI); - __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); - __m128 phase; - __m128 complex1, complex2, iValue, qValue; - __m128 mask; - __m128 keepMask; - - for (; number < quarterPoints; number++) { - // Load IQ data: - complex1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - complex2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); - // Arctan to get phase: - phase = atan2f4(qValue, iValue); - // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. - // Compare to 2pi: - keepMask = _mm_cmpneq_ps(phase,testVector); - phase = _mm_and_ps(phase, keepMask); - mask = _mm_andnot_ps(keepMask, correctVector); - phase = _mm_or_ps(phase, mask); - // done with above correction. 
- phase = _mm_mul_ps(phase, vNormalizeFactor); - _mm_store_ps((float*)outPtr, phase); - outPtr += 4; - } - number = quarterPoints * 4; -#endif /* LV_HAVE_SIMDMATH_H */ - - for (; number < num_points; number++) { - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief performs the atan2 on the input vector and stores the results in the output vector. - \param outputVector The vector where the results will be stored. - \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin). - \param normalizeFactor The atan2 results will be divided by this normalization factor. - \param num_points The number of complex values in the input vector. -*/ -static inline void volk_32fc_s32f_atan2_32f_a16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ - float* outPtr = outputVector; - const float* inPtr = (float*)inputVector; - const float invNormalizeFactor = 1.0 / normalizeFactor; - unsigned int number; - for ( number = 0; number < num_points; number++) { - const float real = *inPtr++; - const float imag = *inPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h new file mode 100644 index 000000000..2460039d2 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H +#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + iValue = _mm_mul_ps(iValue, vScalar); + + _mm_store_ps(floatBuffer, iValue); + *iBufferPtr++ = (int16_t)(floatBuffer[0]); + *iBufferPtr++ = (int16_t)(floatBuffer[1]); + *iBufferPtr++ = (int16_t)(floatBuffer[2]); + *iBufferPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + iBufferPtr = &iBuffer[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_s32f_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h deleted file mode 100644 index 2460039d2..000000000 --- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H -#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - - iValue = _mm_mul_ps(iValue, vScalar); - - _mm_store_ps(floatBuffer, iValue); - *iBufferPtr++ = (int16_t)(floatBuffer[0]); - *iBufferPtr++ = (int16_t)(floatBuffer[1]); - *iBufferPtr++ = (int16_t)(floatBuffer[2]); - *iBufferPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - iBufferPtr = &iBuffer[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_s32f_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h new file mode 100644 index 000000000..f67ab0607 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h @@ -0,0 +1,159 @@ +#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H +#define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +/*! 
+ \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, result; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + 
} +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + 
magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_32fc_s32f_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_32fc_s32f_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + volk_32fc_s32f_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h deleted file mode 100644 index f67ab0607..000000000 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h +++ /dev/null @@ -1,159 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H -#define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -/*! 
- \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, result; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - result = _mm_mul_ps(result, vScalar); - - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); - 
} -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - result = _mm_mul_ps(result, vScalar); - - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - 
magnitudeVectorPtr = &magnitudeVector[number]; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! 
- \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_32fc_s32f_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_32fc_s32f_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - volk_32fc_s32f_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h new file mode 100644 index 000000000..155b93ca2 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h @@ -0,0 +1,111 @@ +#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a16_H +#define INCLUDED_volk_32fc_s32f_power_32fc_a16_H + +#include +#include +#include + +//! raise a complex float to a real float power +static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a16(const lv_32fc_t exp, const float power){ + const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp)); + const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2); + return mag*lv_cmake(cosf(arg), sinf(arg)); +} + +#ifdef LV_HAVE_SSE +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! 
+ \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + + __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + cplxValue2 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + // Convert to polar coordinates + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + phase = atan2f4(qValue, iValue); // Calculate the Phase + + magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values + + // Now calculate the power of the polar coordinate data + magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power + + phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power + + // Convert back to cartesian coordinates + iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude + qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude + + cplxValue1 = 
_mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values + + _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container + + cPtr += 2; + + _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h deleted file mode 100644 index 155b93ca2..000000000 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a16_H -#define INCLUDED_volk_32fc_s32f_power_32fc_a16_H - -#include -#include -#include - -//! 
raise a complex float to a real float power -static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a16(const lv_32fc_t exp, const float power){ - const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp)); - const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2); - return mag*lv_cmake(cosf(arg), sinf(arg)); -} - -#ifdef LV_HAVE_SSE -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Takes each the input complex vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The complex vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - - __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps((float*)aPtr); - aPtr += 2; - - cplxValue2 = _mm_load_ps((float*)aPtr); - aPtr += 2; - - // Convert to polar coordinates - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - phase = atan2f4(qValue, iValue); // Calculate the Phase - - magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values - - // Now calculate the power of the 
polar coordinate data - magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power - - phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power - - // Convert back to cartesian coordinates - iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude - qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude - - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values - - _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container - - cPtr += 2; - - _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container - - cPtr += 2; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - for(;number < num_points; number++){ - *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Takes each the input complex vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The complex vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector - */ -static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h new file mode 100644 index 000000000..03da069c2 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H +#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! 
+ \brief Calculates the log10 power value for each input point + \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iNormalizationFactor = 1.0 / normalizationFactor; +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* 
LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value for each input point + \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided agains all the input values before the power is calculated + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_power_spectrum_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + const float iNormalizationFactor = 1.0 / normalizationFactor; + unsigned int point; + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H */ diff --git 
a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h deleted file mode 100644 index 03da069c2..000000000 --- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H -#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Calculates the log10 power value for each input point - \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param num_points The number of fft data points -*/ -static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iNormalizationFactor = 1.0 / normalizationFactor; -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 magScalar = _mm_set_ps1(10.0); - magScalar = _mm_div_ps(magScalar, logf4(magScalar)); - - __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); - - __m128 power; - __m128 input1, input2; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the complex values - input1 =_mm_load_ps(inputPtr); - inputPtr += 4; - input2 =_mm_load_ps(inputPtr); - inputPtr += 4; - - // Apply the normalization factor - input1 = _mm_mul_ps(input1, invNormalizationFactor); - input2 = _mm_mul_ps(input2, invNormalizationFactor); - - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm_mul_ps(input1, input1); - // (r3*r3), 
(i3*i3), (r4*r4), (i4*i4) - input2 = _mm_mul_ps(input2, input2); - - // Horizontal add, to add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - power = _mm_hadd_ps(input1, input2); - - // Calculate the natural log power - power = logf4(power); - - // Convert to log10 and multiply by 10.0 - power = _mm_mul_ps(power, magScalar); - - // Store the floating point results - _mm_store_ps(destPtr, power); - - destPtr += 4; - } - - number = quarterPoints*4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - - destPtr++; - } - -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Calculates the log10 power value for each input point - \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided agains all the input values before the power is calculated - \param num_points The number of fft data points -*/ -static inline void volk_32fc_s32f_power_spectrum_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ - // Calculate the Power of the complex point - const float* inputPtr = (float*)complexFFTInput; - float* realFFTDataPointsPtr = logPowerOutput; - const float iNormalizationFactor = 1.0 / normalizationFactor; - unsigned int point; - for(point = 0; point < num_points; point++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - - - realFFTDataPointsPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h new file mode 100644 index 000000000..5bcd7f7c4 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h @@ -0,0 +1,134 @@ +#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H +#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + +#ifdef LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! 
+ \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invRBW = _mm_set_ps1(iRBW); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Divide by the rbw + power = _mm_mul_ps(power, invRBW); + + // Calculate the natural log power + power 
= logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + unsigned int point; + const float invRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( 
v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h deleted file mode 100644 index 5bcd7f7c4..000000000 --- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H -#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - -#ifdef LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Calculates the log10 power value divided by the RBW for each input point - \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param rbw The resolution bandwith of the fft spectrum - \param num_points The number of fft data points -*/ -static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - -#ifdef LV_HAVE_LIB_SIMDMATH - __m128 magScalar = _mm_set_ps1(10.0); - magScalar = _mm_div_ps(magScalar, logf4(magScalar)); - - __m128 invRBW = _mm_set_ps1(iRBW); - 
- __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); - - __m128 power; - __m128 input1, input2; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the complex values - input1 =_mm_load_ps(inputPtr); - inputPtr += 4; - input2 =_mm_load_ps(inputPtr); - inputPtr += 4; - - // Apply the normalization factor - input1 = _mm_mul_ps(input1, invNormalizationFactor); - input2 = _mm_mul_ps(input2, invNormalizationFactor); - - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm_mul_ps(input1, input1); - // (r3*r3), (i3*i3), (r4*r4), (i4*i4) - input2 = _mm_mul_ps(input2, input2); - - // Horizontal add, to add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - power = _mm_hadd_ps(input1, input2); - - // Divide by the rbw - power = _mm_mul_ps(power, invRBW); - - // Calculate the natural log power - power = logf4(power); - - // Convert to log10 and multiply by 10.0 - power = _mm_mul_ps(power, magScalar); - - // Store the floating point results - _mm_store_ps(destPtr, power); - - destPtr += 4; - } - - number = quarterPoints*4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); - destPtr++; - } - -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Calculates the log10 power value divided by the RBW for each input point - \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param rbw The resolution bandwith of the fft spectrum - \param num_points The number of fft data points -*/ -static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ - // Calculate the Power of the complex point - const float* inputPtr = (float*)complexFFTInput; - float* realFFTDataPointsPtr = logPowerOutput; - unsigned int point; - const float invRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - - for(point = 0; point < num_points; point++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); - - realFFTDataPointsPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h new file mode 100644 index 000000000..f221237ff --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h @@ -0,0 +1,345 @@ +#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H +#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H + +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + 
+ +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + + + + asm volatile + ( + "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# 
TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %[conjugator], %%r9\n\t" + " movq %%rcx, %%rax\n\t" + " movaps 0(%%r9), %%xmm8\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movups 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " xorps %%xmm8, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " xorps %%xmm8, %%xmm3\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" 
+ " add $32, %%r10\n\t" + " xorps %%xmm8, %%xmm2\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); + + } + + return; +} +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + int bound = num_bytes >> 4; + int leftovers = num_bytes % 16; + + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " #movl 12(%%ebp), %%eax # input\n\t" + " #movl 16(%%ebp), %%edx # taps\n\t" + " #movl 20(%%ebp), %%ecx # n_bytes\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%[eax]), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%[edx]), %%xmm2\n\t" + " movl %[ecx], (%[out])\n\t" + " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" + + " xorps %%xmm1, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? 
cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%[eax]), %%xmmA\n\t" + "# movaps (%[edx]), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%[edx]), %%xmm3\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " xorps %%xmm1, %%xmm3\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " movaps 16(%[eax]), %%xmm1\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%[eax]), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %[eax]\n\t" + " movaps 32(%[edx]), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " xorps %%xmm1, %%xmm2\n\t" + " addl $32, %[edx]\n\t" + ".%=L1_test:\n\t" + " decl %[ecx]\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" + " shrl $4, %[ecx]\n\t" + " andl $1, %[ecx]\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps 
%%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " #movl 8(%%ebp), %[eax] \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%[out])\n\t" + " movss (%[out]), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %[eax] # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) + ); + + + + + printf("%d, %d\n", leftovers, bound); + + for(; leftovers > 0; leftovers -= 8) { + + + *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + + + +#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h deleted file mode 100644 index f221237ff..000000000 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h +++ /dev/null @@ -1,345 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H -#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H - -#include -#include -#include - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const 
lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; - sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] + in[3] * tp[3]; - sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - - } - /* - for(i = 0; i < num_bytes >> 3; ++i) { - *result += input[i] * conjf(taps[i]); - } - */ -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - - - - asm volatile - ( - "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " 
movq %[conjugator], %%r9\n\t" - " movq %%rcx, %%rax\n\t" - " movaps 0(%%r9), %%xmm8\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movups 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " xorps %%xmm8, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " xorps %%xmm8, %%xmm3\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - " xorps %%xmm8, %%xmm2\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - 
" jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) - :"rax", "r8", "r9", "r10" - ); - - - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); - - } - - return; -} -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - int bound = num_bytes >> 4; - int leftovers = num_bytes % 16; - - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " #movl 12(%%ebp), %%eax # input\n\t" - " #movl 16(%%ebp), %%edx # taps\n\t" - " #movl 20(%%ebp), %%ecx # n_bytes\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%[eax]), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%[edx]), %%xmm2\n\t" - " movl %[ecx], (%[out])\n\t" - " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" - - " xorps %%xmm1, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? 
cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%[eax]), %%xmmA\n\t" - "# movaps (%[edx]), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%[edx]), %%xmm3\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " xorps %%xmm1, %%xmm3\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " movaps 16(%[eax]), %%xmm1\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%[eax]), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %[eax]\n\t" - " movaps 32(%[edx]), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " xorps %%xmm1, %%xmm2\n\t" - " addl $32, %[edx]\n\t" - ".%=L1_test:\n\t" - " decl %[ecx]\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" - " shrl $4, %[ecx]\n\t" - " andl $1, %[ecx]\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps 
%%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " #movl 8(%%ebp), %[eax] \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%[out])\n\t" - " movss (%[out]), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %[eax] # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) - ); - - - - - printf("%d, %d\n", leftovers, bound); - - for(; leftovers > 0; leftovers -= 8) { - - - *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); - - } - - return; - - - - - - -} - -#endif /*LV_HAVE_SSE*/ - - - -#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h new file mode 100644 index 000000000..9657c8f6b --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h @@ -0,0 +1,469 @@ +#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H +#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H + +#include +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int 
num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; + + } + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + asm + ( + "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %%rcx, %%rax\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = 
n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, 
%%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " movl 12(%%ebp), %%eax # input\n\t" + " movl 16(%%ebp), %%edx # taps\n\t" + " movl 20(%%ebp), %%ecx # n_bytes\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%eax), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%edx), %%xmm2\n\t" + " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? 
cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%eax), %%xmmA\n\t" + "# movaps (%%edx), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%eax), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%edx), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%eax), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %%eax\n\t" + " movaps 32(%%edx), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " addl $32, %%edx\n\t" + ".%=L1_test:\n\t" + " decl %%ecx\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" + " shrl $4, %%ecx\n\t" + " andl $1, %%ecx\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " 
addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " movl 8(%%ebp), %%eax \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%%eax)\n\t" + " movss (%%eax), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %%eax # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : + : "eax", "ecx", "edx" + ); + + + int getem = num_bytes % 16; + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + +#ifdef LV_HAVE_SSE3 + +#include + +/*! + \brief Computes the complex dot product of input and taps (sum of input[i] * taps[i]) and stores it in result + \param result Where the single complex dot-product value is written + \param input First complex vector; read with _mm_load_ps, so it must be 16-byte aligned + \param taps Second complex vector; read with _mm_load_ps, so it must be 16-byte aligned + \param num_bytes Size of each vector in bytes (8 bytes per complex point, per the >> 3 / >> 4 arithmetic below) + */ +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_bytes >> 4; // two complex points (16 bytes) are consumed per iteration + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x 
= _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + + _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if(((num_bytes >> 3) & 1) != 0) { // Odd number of points: a and b now point at the single unpaired sample + dotProduct += (*a) * (*b); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_SSE4_1 + +#include + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + volk_32fc_x2_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes); + // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now + /* + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; + float *p_input, *p_taps; + __m64 *p_result; + + p_result = (__m64*)result; + p_input = (float*)input; + p_taps = (float*)taps; + + static const __m128i neg = {0x000000000000000080000000}; + + int i = 0; + + int bound = (num_bytes >> 5); + int leftovers = (num_bytes & 24) >> 3; + + real0 = _mm_sub_ps(real0, real0); + real1 = _mm_sub_ps(real1, real1); + im0 = _mm_sub_ps(im0, im0); + im1 = _mm_sub_ps(im1, im1); + + for(; i < bound; ++i) { + + + xmm0 = _mm_load_ps(p_input); + xmm1 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm2 = _mm_load_ps(p_input); + xmm3 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); + xmm5 = _mm_unpackhi_ps(xmm1, xmm3); + xmm0 = _mm_unpacklo_ps(xmm0, xmm2); + xmm2 = _mm_unpacklo_ps(xmm1, xmm3); + + //imaginary vector from input + xmm1 = _mm_unpackhi_ps(xmm0, xmm4); + 
//real vector from input + xmm3 = _mm_unpacklo_ps(xmm0, xmm4); + //imaginary vector from taps + xmm0 = _mm_unpackhi_ps(xmm2, xmm5); + //real vector from taps + xmm2 = _mm_unpacklo_ps(xmm2, xmm5); + + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); + xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); + + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); + xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); + + real0 = _mm_add_ps(xmm4, real0); + real1 = _mm_add_ps(xmm5, real1); + im0 = _mm_add_ps(xmm6, im0); + im1 = _mm_add_ps(xmm7, im1); + + } + + + + + real1 = _mm_xor_ps(real1, (__m128)neg); + + + im0 = _mm_add_ps(im0, im1); + real0 = _mm_add_ps(real0, real1); + + im0 = _mm_add_ps(im0, real0); + + _mm_storel_pi(p_result, im0); + + for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { + + *result += input[i] * taps[i]; + } + */ +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h deleted file mode 100644 index 9657c8f6b..000000000 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h +++ /dev/null @@ -1,469 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H -#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H - -#include -#include -#include -#include - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = 
sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; - - } - -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - - asm - ( - "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %%rcx, %%rax\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? 
cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - 
".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) - :"rax", "r8", "r9", "r10" - ); - - - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - - } - - return; - -} - -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 - -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " movl 12(%%ebp), %%eax # input\n\t" - " movl 16(%%ebp), %%edx # taps\n\t" - " movl 20(%%ebp), %%ecx # n_bytes\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%eax), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%edx), %%xmm2\n\t" - " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? 
cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%eax), %%xmmA\n\t" - "# movaps (%%edx), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%eax), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%edx), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%eax), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %%eax\n\t" - " movaps 32(%%edx), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " addl $32, %%edx\n\t" - ".%=L1_test:\n\t" - " decl %%ecx\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" - " shrl $4, %%ecx\n\t" - " andl $1, %%ecx\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " 
addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " movl 8(%%ebp), %%eax \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%%eax)\n\t" - " movss (%%eax), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %%eax # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : - : "eax", "ecx", "edx" - ); - - - int getem = num_bytes % 16; - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - - } - - return; - - - - - - -} - -#endif /*LV_HAVE_SSE*/ - -#ifdef LV_HAVE_SSE3 - -#include - -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); - - unsigned int number = 0; - const unsigned int halfPoints = num_bytes >> 4; - - __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; - - dotProdVal = _mm_setzero_ps(); - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x 
= _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together - - a += 2; - b += 2; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; - - _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - - if((num_bytes >> 2) != 0) { - dotProduct += (*a) * (*b); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_SSE4_1 - -#include - -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - volk_32fc_x2_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes); - // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now - /* - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; - float *p_input, *p_taps; - __m64 *p_result; - - p_result = (__m64*)result; - p_input = (float*)input; - p_taps = (float*)taps; - - static const __m128i neg = {0x000000000000000080000000}; - - int i = 0; - - int bound = (num_bytes >> 5); - int leftovers = (num_bytes & 24) >> 3; - - real0 = _mm_sub_ps(real0, real0); - real1 = _mm_sub_ps(real1, real1); - im0 = _mm_sub_ps(im0, im0); - im1 = _mm_sub_ps(im1, im1); - - for(; i < bound; ++i) { - - - xmm0 = _mm_load_ps(p_input); - xmm1 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm2 = _mm_load_ps(p_input); - xmm3 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm4 = _mm_unpackhi_ps(xmm0, xmm2); - xmm5 = _mm_unpackhi_ps(xmm1, xmm3); - xmm0 = _mm_unpacklo_ps(xmm0, xmm2); - xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - - //imaginary vector from input - xmm1 = _mm_unpackhi_ps(xmm0, xmm4); - 
//real vector from input - xmm3 = _mm_unpacklo_ps(xmm0, xmm4); - //imaginary vector from taps - xmm0 = _mm_unpackhi_ps(xmm2, xmm5); - //real vector from taps - xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - - xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); - xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - - xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); - xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - - real0 = _mm_add_ps(xmm4, real0); - real1 = _mm_add_ps(xmm5, real1); - im0 = _mm_add_ps(xmm6, im0); - im1 = _mm_add_ps(xmm7, im1); - - } - - - - - real1 = _mm_xor_ps(real1, (__m128)neg); - - - im0 = _mm_add_ps(im0, im1); - real0 = _mm_add_ps(real0, real1); - - im0 = _mm_add_ps(im0, real0); - - _mm_storel_pi(p_result, im0); - - for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { - - *result += input[i] * taps[i]; - } - */ -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h new file mode 100644 index 000000000..72010b855 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a16_H +#define INCLUDED_volk_32fc_x2_multiply_32fc_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { // Odd count: one complex value remains after the pairwise loop + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); +static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h deleted file mode 100644 index 72010b855..000000000 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef 
INCLUDED_volk_32fc_x2_multiply_32fc_a16_H -#define INCLUDED_volk_32fc_x2_multiply_32fc_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_store_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * (*b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); -static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - - - -#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h new file mode 100644 index 000000000..910f51679 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h @@ -0,0 +1,126 @@ 
+#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H +#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +#include + +/*! + \brief Writes scalar * |src0[0] - points[i]|^2 into target[i] for every complex point in points + \param target Output floats, one per point; written with _mm_store_ps in the 4-point path + \param src0 The complex reference value (only src0[0] is read) + \param points Complex input points; read with _mm_load_ps + \param scalar Factor applied to every squared distance + \param num_bytes Size of points in bytes (8 bytes per complex point, per the >> 3 / >> 4 / >> 5 splits below) + */ +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + lv_32fc_t diff; + memset(&diff, 0x0, 2*sizeof(float)); + + float sq_dist = 0.0; + int bound = num_bytes >> 5; // Number of full groups of 4 points (32 bytes) + int leftovers0 = (num_bytes >> 4) & 1; // 1 if a pair of points (16 bytes) remains after the groups + int leftovers1 = (num_bytes >> 3) & 1; // 1 if a single point (8 bytes) remains after that + int i = 0; + + + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm8 = _mm_load1_ps(&scalar); + xmm1 = _mm_movelh_ps(xmm1, xmm1); // src0[0] duplicated into both halves of the register + + // Only run the pipelined 4-point path when at least one full group exists; with bound == 0 its preloads and drain step would read and write 4 points that are not there + if(bound > 0) { + + xmm2 = _mm_load_ps((float*)&points[0]); + xmm3 = _mm_load_ps((float*)&points[2]); + + for(; i < bound - 1; ++i) { + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + // Drain: the last group of 4 points is already in xmm2/xmm3 from the preload or the previous iteration + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + 
lv_cimag(diff) * lv_cimag(diff)); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_GENERIC +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h deleted file mode 100644 index 910f51679..000000000 --- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H -#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -#include - -static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - - - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - - lv_32fc_t diff; - memset(&diff, 0x0, 2*sizeof(float)); - - float sq_dist = 0.0; - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - - - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm8 = _mm_load1_ps(&scalar); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); - - - for(; i < bound - 1; ++i) { - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = 
_mm_mul_ps(xmm5, xmm5); - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm3 = _mm_load_ps((float*)&points[2]); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_store_ps(target, xmm4); - - target += 4; - - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_store_ps(target, xmm4); - - target += 4; - - - for(i = 0; i < leftovers0; ++i) { - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); - - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); - - xmm4 = _mm_hadd_ps(xmm6, xmm6); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_storeh_pi((__m64*)target, xmm4); - - target += 2; - } - - for(i = 0; i < leftovers1; ++i) { - - diff = src0[0] - points[0]; - - sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - - target[0] = sq_dist; - } -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - lv_32fc_t diff; - float sq_dist; - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; - - sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - - target[i] = sq_dist; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h new file mode 100644 index 000000000..551f3cb53 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h @@ -0,0 +1,112 @@ +#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a16_H +#define INCLUDED_volk_32fc_x2_square_dist_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 
+#include +#include + +static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + lv_32fc_t diff; + float sq_dist; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + _mm_store_ps(target, xmm4); + + target += 4; + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_GENERIC +static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * 
lv_cimag(diff); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h deleted file mode 100644 index 551f3cb53..000000000 --- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a16_H -#define INCLUDED_volk_32fc_x2_square_dist_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -#include - -static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - - - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - lv_32fc_t diff; - float sq_dist; - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); - - - for(; i < bound - 1; ++i) { - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm3 = _mm_load_ps((float*)&points[2]); - - _mm_store_ps(target, xmm4); - - target += 4; - - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - _mm_store_ps(target, xmm4); - - target += 4; - - for(i = 0; i < leftovers0; ++i) { - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); - - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); - - xmm4 = _mm_hadd_ps(xmm6, xmm6); - - _mm_storeh_pi((__m64*)target, xmm4); - - target += 2; - } - - 
for(i = 0; i < leftovers1; ++i) { - - diff = src0[0] - points[0]; - - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[0] = sq_dist; - } -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - lv_32fc_t diff; - float sq_dist; - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; - - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[i] = sq_dist; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a16_H*/ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a.h b/volk/include/volk/volk_32i_s32f_convert_32f_a.h new file mode 100644 index 000000000..b744c7197 --- /dev/null +++ b/volk/include/volk/volk_32i_s32f_convert_32f_a.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_volk_32i_s32f_convert_32f_a16_H +#define INCLUDED_volk_32i_s32f_convert_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + + /*! 
+ \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_load_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_store_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32i_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32i_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a16.h b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h deleted file mode 100644 index b744c7197..000000000 --- a/volk/include/volk/volk_32i_s32f_convert_32f_a16.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_volk_32i_s32f_convert_32f_a16_H -#define INCLUDED_volk_32i_s32f_convert_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - - /*! 
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_load_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_store_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32i_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32i_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_32i_x2_and_32i_a.h b/volk/include/volk/volk_32i_x2_and_32i_a.h new file mode 100644 index 000000000..4d50efd32 --- /dev/null +++ b/volk/include/volk/volk_32i_x2_and_32i_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32i_x2_and_32i_a16_H +#define INCLUDED_volk_32i_x2_and_32i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_and_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] & bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) & (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +extern void volk_32i_x2_and_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32i_x2_and_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32i_x2_and_32i_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32i_x2_and_32i_a16_H */ diff --git a/volk/include/volk/volk_32i_x2_and_32i_a16.h b/volk/include/volk/volk_32i_x2_and_32i_a16.h deleted file mode 100644 index 4d50efd32..000000000 --- a/volk/include/volk/volk_32i_x2_and_32i_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32i_x2_and_32i_a16_H -#define INCLUDED_volk_32i_x2_and_32i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_and_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] & bVector[number]; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) & (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! 
- \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -extern void volk_32i_x2_and_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32i_x2_and_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32i_x2_and_32i_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32i_x2_and_32i_a16_H */ diff --git a/volk/include/volk/volk_32i_x2_or_32i_a.h b/volk/include/volk/volk_32i_x2_or_32i_a.h new file mode 100644 index 000000000..9edbdbafd --- /dev/null +++ b/volk/include/volk/volk_32i_x2_or_32i_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32i_x2_or_32i_a16_H +#define INCLUDED_volk_32i_x2_or_32i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_or_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] | bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) | (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +extern void volk_32i_x2_or_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32i_x2_or_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32i_x2_or_32i_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32i_x2_or_32i_a16_H */ diff --git a/volk/include/volk/volk_32i_x2_or_32i_a16.h b/volk/include/volk/volk_32i_x2_or_32i_a16.h deleted file mode 100644 index 9edbdbafd..000000000 --- a/volk/include/volk/volk_32i_x2_or_32i_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32i_x2_or_32i_a16_H -#define INCLUDED_volk_32i_x2_or_32i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_or_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] | bVector[number]; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) | (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! 
- \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -extern void volk_32i_x2_or_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32i_x2_or_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32i_x2_or_32i_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32i_x2_or_32i_a16_H */ diff --git a/volk/include/volk/volk_32u_byteswap_a.h b/volk/include/volk/volk_32u_byteswap_a.h new file mode 100644 index 000000000..dc5cedab9 --- /dev/null +++ b/volk/include/volk/volk_32u_byteswap_a.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32u_byteswap_a16_H +#define INCLUDED_volk_32u_byteswap_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + + uint32_t* inputPtr = intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ input = _mm_load_si128((__m128i*)inputPtr); + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = quarterPoints*4; + for(; number < num_points; number++){ + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_a16_generic(uint32_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = intsToSwap; + + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_32u_byteswap_a16.h b/volk/include/volk/volk_32u_byteswap_a16.h deleted file mode 100644 index dc5cedab9..000000000 --- a/volk/include/volk/volk_32u_byteswap_a16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_32u_byteswap_a16_H -#define INCLUDED_volk_32u_byteswap_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - -/*! 
- \brief Byteswaps (in-place) an aligned vector of int32_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - - uint32_t* inputPtr = intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = quarterPoints*4; - for(; number < num_points; number++){ - uint32_t outputVal = *inputPtr; - outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); - *inputPtr = outputVal; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int32_t's. 
- \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_32u_byteswap_a16_generic(uint32_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = intsToSwap; - - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output = *inputPtr; - output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); - - *inputPtr = output; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_32u_popcnt_a.h b/volk/include/volk/volk_32u_popcnt_a.h new file mode 100644 index 000000000..0d8b48fd5 --- /dev/null +++ b/volk/include/volk/volk_32u_popcnt_a.h @@ -0,0 +1,36 @@ +#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H +#define INCLUDED_VOLK_32u_POPCNT_A16_H + +#include +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t value) { + + // This is faster than a lookup table + uint32_t retVal = value; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + + *ret = retVal; +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_SSE4_2 + +#include + +static inline void volk_32u_popcnt_a16_sse4_2(uint32_t* ret, const uint32_t value) { + *ret = _mm_popcnt_u32(value); +} + +#endif /*LV_HAVE_SSE4_2*/ + +#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/ diff --git a/volk/include/volk/volk_32u_popcnt_a16.h b/volk/include/volk/volk_32u_popcnt_a16.h deleted file mode 100644 index 0d8b48fd5..000000000 --- a/volk/include/volk/volk_32u_popcnt_a16.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H -#define INCLUDED_VOLK_32u_POPCNT_A16_H - -#include -#include - - -#ifdef 
LV_HAVE_GENERIC - -static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t value) { - - // This is faster than a lookup table - uint32_t retVal = value; - - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - - *ret = retVal; -} - -#endif /*LV_HAVE_GENERIC*/ - -#ifdef LV_HAVE_SSE4_2 - -#include - -static inline void volk_32u_popcnt_a16_sse4_2(uint32_t* ret, const uint32_t value) { - *ret = _mm_popcnt_u32(value); -} - -#endif /*LV_HAVE_SSE4_2*/ - -#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/ diff --git a/volk/include/volk/volk_64f_convert_32f_a.h b/volk/include/volk/volk_64f_convert_32f_a.h new file mode 100644 index 000000000..cfcdbdc3a --- /dev/null +++ b/volk/include/volk/volk_64f_convert_32f_a.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_volk_64f_convert_32f_a16_H +#define INCLUDED_volk_64f_convert_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! 
+ \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_a16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_a16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_a16.h b/volk/include/volk/volk_64f_convert_32f_a16.h deleted file mode 100644 index cfcdbdc3a..000000000 --- a/volk/include/volk/volk_64f_convert_32f_a16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_volk_64f_convert_32f_a16_H -#define INCLUDED_volk_64f_convert_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - /*! 
- \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_64f_convert_32f_a16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); - - ret = _mm_movelh_ps(ret, ret2); - - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_64f_convert_32f_a16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_64f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_64f_x2_max_64f_a.h b/volk/include/volk/volk_64f_x2_max_64f_a.h new file mode 100644 index 000000000..21f488bf7 --- /dev/null +++ b/volk/include/volk/volk_64f_x2_max_64f_a.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_64f_x2_max_64f_a16_H +#define INCLUDED_volk_64f_x2_max_64f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! 
+ \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_max_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? 
a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_64f_x2_max_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_x2_max_64f_a16.h b/volk/include/volk/volk_64f_x2_max_64f_a16.h deleted file mode 100644 index 21f488bf7..000000000 --- a/volk/include/volk/volk_64f_x2_max_64f_a16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_volk_64f_x2_max_64f_a16_H -#define INCLUDED_volk_64f_x2_max_64f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); - - cVal = _mm_max_pd(aVal, bVal); - - _mm_store_pd(cPtr,cVal); // Store the results back into the C container - - aPtr += 2; - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_x2_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_64f_x2_max_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_x2_min_64f_a.h b/volk/include/volk/volk_64f_x2_min_64f_a.h new file mode 100644 index 000000000..8711a0eae --- /dev/null +++ b/volk/include/volk/volk_64f_x2_min_64f_a.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_64f_x2_min_64f_a16_H +#define INCLUDED_volk_64f_x2_min_64f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! 
+ \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_min_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? 
a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_64f_x2_min_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_x2_min_64f_a16.h b/volk/include/volk/volk_64f_x2_min_64f_a16.h deleted file mode 100644 index 8711a0eae..000000000 --- a/volk/include/volk/volk_64f_x2_min_64f_a16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_volk_64f_x2_min_64f_a16_H -#define INCLUDED_volk_64f_x2_min_64f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); - - cVal = _mm_min_pd(aVal, bVal); - - _mm_store_pd(cPtr,cVal); // Store the results back into the C container - - aPtr += 2; - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_x2_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_64f_x2_min_64f_a16_H */ diff --git a/volk/include/volk/volk_64u_byteswap_a.h b/volk/include/volk/volk_64u_byteswap_a.h new file mode 100644 index 000000000..b4bed8451 --- /dev/null +++ b/volk/include/volk/volk_64u_byteswap_a.h @@ -0,0 +1,88 @@ +#ifndef INCLUDED_volk_64u_byteswap_a16_H +#define INCLUDED_volk_64u_byteswap_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + uint64_t number = 0; + const unsigned int halfPoints = num_points / 2; + for(;number < halfPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ input = _mm_load_si128((__m128i*)inputPtr); + + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + + // Reorder the two words + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = halfPoints*2; + for(; number < num_points; number++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. 
+ \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_a16_generic(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_64u_byteswap_a16.h b/volk/include/volk/volk_64u_byteswap_a16.h deleted file mode 100644 index b4bed8451..000000000 --- a/volk/include/volk/volk_64u_byteswap_a16.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef INCLUDED_volk_64u_byteswap_a16_H -#define INCLUDED_volk_64u_byteswap_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - -/*! - \brief Byteswaps (in-place) an aligned vector of int64_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - uint64_t number = 0; - const unsigned int halfPoints = num_points / 2; - for(;number < halfPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. 
- input = _mm_load_si128((__m128i*)inputPtr); - - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - - // Reorder the two words - output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); - - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = halfPoints*2; - for(; number < num_points; number++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int64_t's. 
- \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_64u_byteswap_a16_generic(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_64u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h new file mode 100644 index 000000000..8b92e91a1 --- /dev/null +++ b/volk/include/volk/volk_64u_popcnt_a.h @@ -0,0 +1,50 @@ +#ifndef INCLUDED_volk_64u_popcnt_a16_H +#define INCLUDED_volk_64u_popcnt_a16_H + +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t value) { + + const uint32_t* valueVector = (const uint32_t*)&value; + + // This is faster than a lookup table + uint32_t retVal = valueVector[0]; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + uint64_t retVal64 = retVal; + + retVal = valueVector[1]; + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + retVal64 += retVal; + + *ret = 
retVal64; + +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE4_2 && LV_HAVE_64 + +#include + +static inline void volk_64u_popcnt_a16_sse4_2(uint64_t* ret, const uint64_t value) { + *ret = _mm_popcnt_u64(value); + +} + +#endif /*LV_HAVE_SSE4_2*/ + +#endif /*INCLUDED_volk_64u_popcnt_a16_H*/ diff --git a/volk/include/volk/volk_64u_popcnt_a16.h b/volk/include/volk/volk_64u_popcnt_a16.h deleted file mode 100644 index 8b92e91a1..000000000 --- a/volk/include/volk/volk_64u_popcnt_a16.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef INCLUDED_volk_64u_popcnt_a16_H -#define INCLUDED_volk_64u_popcnt_a16_H - -#include -#include - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t value) { - - const uint32_t* valueVector = (const uint32_t*)&value; - - // This is faster than a lookup table - uint32_t retVal = valueVector[0]; - - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - uint64_t retVal64 = retVal; - - retVal = valueVector[1]; - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - retVal64 += retVal; - - *ret = retVal64; - -} - -#endif /*LV_HAVE_GENERIC*/ - -#if LV_HAVE_SSE4_2 && LV_HAVE_64 - -#include - -static inline void volk_64u_popcnt_a16_sse4_2(uint64_t* ret, const uint64_t value) { - *ret = _mm_popcnt_u64(value); - -} - -#endif /*LV_HAVE_SSE4_2*/ - -#endif /*INCLUDED_volk_64u_popcnt_a16_H*/ diff --git a/volk/include/volk/volk_8i_convert_16i_a.h b/volk/include/volk/volk_8i_convert_16i_a.h new file mode 100644 index 000000000..260ac40a1 --- /dev/null +++ b/volk/include/volk/volk_8i_convert_16i_a.h @@ 
-0,0 +1,83 @@ +#ifndef INCLUDED_volk_8i_convert_16i_a16_H +#define INCLUDED_volk_8i_convert_16i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +extern void volk_8i_convert_16i_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); +static inline void volk_8i_convert_16i_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + volk_8i_convert_16i_a16_orc_impl(outputVector, inputVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_convert_16i_a16.h b/volk/include/volk/volk_8i_convert_16i_a16.h deleted file mode 100644 index 260ac40a1..000000000 --- a/volk/include/volk/volk_8i_convert_16i_a16.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef INCLUDED_volk_8i_convert_16i_a16_H -#define INCLUDED_volk_8i_convert_16i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - - /*! 
- \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! 
- \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -extern void volk_8i_convert_16i_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); -static inline void volk_8i_convert_16i_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - volk_8i_convert_16i_a16_orc_impl(outputVector, inputVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a.h b/volk/include/volk/volk_8i_s32f_convert_32f_a.h new file mode 100644 index 000000000..9991b150e --- /dev/null +++ b/volk/include/volk/volk_8i_s32f_convert_32f_a.h @@ -0,0 +1,106 @@ +#ifndef INCLUDED_volk_8i_s32f_convert_32f_a16_H +#define INCLUDED_volk_8i_s32f_convert_32f_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + + /*! 
+ \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* 
LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); +static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float invscalar = 1.0 / scalar; + volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, invscalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h deleted file mode 100644 index 9991b150e..000000000 
--- a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef INCLUDED_volk_8i_s32f_convert_32f_a16_H -#define INCLUDED_volk_8i_s32f_convert_32f_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - 
_mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! 
- \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); -static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float invscalar = 1.0 / scalar; - volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, invscalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h new file mode 100644 index 000000000..249acab49 --- /dev/null +++ b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H +#define INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! 
+ \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + __m128i complexVal, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iOutputVal = _mm_cvtepi8_epi16(iOutputVal); + iOutputVal = _mm_slli_epi16(iOutputVal, 8); + + qOutputVal = _mm_cvtepi8_epi16(qOutputVal); + qOutputVal = _mm_slli_epi16(qOutputVal, 8); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h deleted file mode 100644 index 249acab49..000000000 --- a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H -#define INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! 
- \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - __m128i complexVal, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iOutputVal = _mm_cvtepi8_epi16(iOutputVal); - iOutputVal = _mm_slli_epi16(iOutputVal, 8); - - qOutputVal = _mm_cvtepi8_epi16(qOutputVal); - qOutputVal = _mm_slli_epi16(qOutputVal, 8); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h new file mode 100644 index 000000000..7b64b37c5 --- /dev/null +++ b/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a16_H +#define INCLUDED_volk_8ic_deinterleave_real_16i_a16_H + +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! 
+ \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i complexVal, outputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + outputVal = _mm_cvtepi8_epi16(complexVal); + outputVal = _mm_slli_epi16(outputVal, 7); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h deleted file mode 100644 index 7b64b37c5..000000000 --- a/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a16_H -#define INCLUDED_volk_8ic_deinterleave_real_16i_a16_H - -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! 
- \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i complexVal, outputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - outputVal = _mm_cvtepi8_epi16(complexVal); - outputVal = _mm_slli_epi16(outputVal, 7); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE4_1 */ - - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h new file mode 100644 index 000000000..a1abad487 --- /dev/null +++ b/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H + +#include +#include + +#ifdef LV_HAVE_SSSE3 +#include +/*! 
+ \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, outputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); + + outputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h deleted file mode 100644 index a1abad487..000000000 --- a/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H - -#include -#include - -#ifdef LV_HAVE_SSSE3 -#include -/*! 
- \brief Deinterleaves the complex 8 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, outputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); - - outputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the complex 8 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h new file mode 100644 index 000000000..7d778796e --- /dev/null +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h @@ -0,0 +1,165 @@ +#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H +#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! 
+ \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue, qFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + iComplexVal = _mm_srli_si128(iComplexVal, 4); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + 
_mm_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 4; + + qComplexVal = _mm_srli_si128(qComplexVal, 4); + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + + qBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + 
complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int8_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h deleted file mode 100644 index 7d778796e..000000000 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h +++ /dev/null 
@@ -1,165 +0,0 @@ -#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H -#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue, qFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - iComplexVal = _mm_srli_si128(iComplexVal, 4); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - 
_mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - qBufferPtr += 4; - - qComplexVal = _mm_srli_si128(qComplexVal, 4); - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - - qBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); - - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = 
(float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int8_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H */ diff --git 
a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h new file mode 100644 index 000000000..a2e0cd8de --- /dev/null +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h @@ -0,0 +1,134 @@ +#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H +#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + + complexVal = _mm_srli_si128(complexVal, 4); + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = 
eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h deleted file mode 100644 index a2e0cd8de..000000000 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H -#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! 
- \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - - complexVal = _mm_srli_si128(complexVal, 4); - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; - - iValue = _mm_load_ps(floatBuffer); - - iValue = _mm_mul_ps(iValue, invScalar); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h new file mode 100644 index 000000000..7307ae484 --- /dev/null +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h @@ -0,0 +1,101 @@ +#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H +#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! 
+ \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + lv_16sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + int16_t* c16Ptr = (int16_t*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag 
); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + int16_t* c16Ptr = (int16_t*)cVector; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number =0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H */ diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h deleted file mode 100644 index 7307ae484..000000000 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H -#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H - 
-#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - lv_16sc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); - - a += 4; - b += 4; - c += 4; - } - - number = quarterPoints * 4; - int16_t* c16Ptr = (int16_t*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - 
float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - int16_t* c16Ptr = (int16_t*)cVector; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number =0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H */ diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h new file mode 100644 index 000000000..adc7c0599 --- /dev/null +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h @@ -0,0 +1,122 @@ +#ifndef 
INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H +#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + __m128 ret; + lv_32fc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); + + // Normalize the floating point values + ret = 
_mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + a += 4; + b += 4; + } + + number = quarterPoints * 4; + float* cFloatPtr = (float*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cFloatPtr++ = lv_creal(temp) / scalar; + *cFloatPtr++ = lv_cimag(temp) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* cPtr = (float*)cVector; + const float invScalar = 1.0 / scalar; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number = 0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cPtr++ = (lv_creal(temp) * invScalar); + *cPtr++ = (lv_cimag(temp) * invScalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H */ diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h deleted file mode 100644 index adc7c0599..000000000 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H -#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include -/*! 
- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - __m128 ret; - lv_32fc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - // Interleave real and imaginary and then convert to float values - ret = 
_mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - a += 4; - b += 4; - } - - number = quarterPoints * 4; - float* cFloatPtr = (float*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cFloatPtr++ = lv_creal(temp) / scalar; - *cFloatPtr++ = lv_cimag(temp) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* cPtr = (float*)cVector; - const float invScalar = 1.0 / scalar; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number = 0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = 
aVal * bVal; - - *cPtr++ = (lv_creal(temp) * invScalar); - *cPtr++ = (lv_cimag(temp) * invScalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H */ -- cgit From 5f145a323af39c631e55e760e69a797126a54eb3 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 18 May 2011 13:20:50 -0700 Subject: Volk: rename functions _a instead of _a16 --- volk/include/volk/volk_16i_branch_4_state_8_a.h | 10 +++++----- volk/include/volk/volk_16i_convert_8i_a.h | 10 +++++----- volk/include/volk/volk_16i_max_star_16i_a.h | 10 +++++----- volk/include/volk/volk_16i_max_star_horizontal_16i_a.h | 10 +++++----- volk/include/volk/volk_16i_permute_and_scalar_add_a.h | 10 +++++----- volk/include/volk/volk_16i_s32f_convert_32f_a.h | 12 ++++++------ volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h | 18 +++++++++--------- volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h | 18 +++++++++--------- volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h | 18 +++++++++--------- volk/include/volk/volk_16ic_deinterleave_real_16i_a.h | 12 ++++++------ volk/include/volk/volk_16ic_deinterleave_real_8i_a.h | 16 ++++++++-------- volk/include/volk/volk_16ic_magnitude_16i_a.h | 18 +++++++++--------- .../volk/volk_16ic_s32f_deinterleave_32f_x2_a.h | 16 ++++++++-------- .../volk/volk_16ic_s32f_deinterleave_real_32f_a.h | 12 ++++++------ volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h | 18 +++++++++--------- volk/include/volk/volk_16u_byteswap_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_accumulator_s32f_a.h | 10 +++++----- volk/include/volk/volk_32f_convert_64f_a.h | 10 +++++----- volk/include/volk/volk_32f_index_max_16u_a.h | 12 ++++++------ volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h | 10 +++++----- .../volk_32f_s32f_calc_spectral_noise_floor_32f_a.h | 10 +++++----- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 12 ++++++------ volk/include/volk/volk_32f_s32f_convert_32i_a.h | 14 +++++++------- 
volk/include/volk/volk_32f_s32f_convert_8i_a.h | 12 ++++++------ volk/include/volk/volk_32f_s32f_normalize_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_s32f_power_32f_a.h | 12 ++++++------ volk/include/volk/volk_32f_s32f_stddev_32f_a.h | 12 ++++++------ volk/include/volk/volk_32f_sqrt_32f_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h | 12 ++++++------ volk/include/volk/volk_32f_x2_add_32f_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_x2_divide_32f_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 14 +++++++------- volk/include/volk/volk_32f_x2_interleave_32fc_a.h | 10 +++++----- volk/include/volk/volk_32f_x2_max_32f_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_x2_min_32f_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_x2_multiply_32f_a.h | 18 +++++++++--------- volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h | 12 ++++++------ volk/include/volk/volk_32f_x2_subtract_32f_a.h | 16 ++++++++-------- volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h | 10 +++++----- volk/include/volk/volk_32fc_32f_multiply_32fc_a.h | 16 ++++++++-------- volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h | 10 +++++----- volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h | 10 +++++----- volk/include/volk/volk_32fc_deinterleave_real_32f_a.h | 10 +++++----- volk/include/volk/volk_32fc_deinterleave_real_64f_a.h | 10 +++++----- volk/include/volk/volk_32fc_index_max_16u_a.h | 10 +++++----- volk/include/volk/volk_32fc_magnitude_32f_a.h | 18 +++++++++--------- volk/include/volk/volk_32fc_s32f_atan2_32f_a.h | 12 ++++++------ .../volk/volk_32fc_s32f_deinterleave_real_16i_a.h | 10 +++++----- volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h | 18 +++++++++--------- volk/include/volk/volk_32fc_s32f_power_32fc_a.h | 16 ++++++++-------- .../include/volk/volk_32fc_s32f_power_spectrum_32f_a.h | 10 +++++----- .../volk_32fc_s32f_x2_power_spectral_density_32f_a.h | 10 +++++----- 
.../volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h | 12 ++++++------ volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h | 18 +++++++++--------- volk/include/volk/volk_32fc_x2_multiply_32fc_a.h | 16 ++++++++-------- .../volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h | 10 +++++----- volk/include/volk/volk_32fc_x2_square_dist_32f_a.h | 10 +++++----- volk/include/volk/volk_32i_s32f_convert_32f_a.h | 10 +++++----- volk/include/volk/volk_32i_x2_and_32i_a.h | 16 ++++++++-------- volk/include/volk/volk_32i_x2_or_32i_a.h | 16 ++++++++-------- volk/include/volk/volk_32u_byteswap_a.h | 10 +++++----- volk/include/volk/volk_32u_popcnt_a.h | 4 ++-- volk/include/volk/volk_64f_convert_32f_a.h | 10 +++++----- volk/include/volk/volk_64f_x2_max_64f_a.h | 10 +++++----- volk/include/volk/volk_64f_x2_min_64f_a.h | 10 +++++----- volk/include/volk/volk_64u_byteswap_a.h | 10 +++++----- volk/include/volk/volk_64u_popcnt_a.h | 10 +++++----- volk/include/volk/volk_8i_convert_16i_a.h | 14 +++++++------- volk/include/volk/volk_8i_s32f_convert_32f_a.h | 14 +++++++------- volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h | 10 +++++----- volk/include/volk/volk_8ic_deinterleave_real_16i_a.h | 10 +++++----- volk/include/volk/volk_8ic_deinterleave_real_8i_a.h | 4 ++-- .../include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h | 12 ++++++------ .../volk/volk_8ic_s32f_deinterleave_real_32f_a.h | 12 ++++++------ .../volk/volk_8ic_x2_multiply_conjugate_16ic_a.h | 10 +++++----- .../volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h | 10 +++++----- 76 files changed, 477 insertions(+), 477 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a.h b/volk/include/volk/volk_16i_branch_4_state_8_a.h index 5eb03b346..0424e66e9 100644 --- a/volk/include/volk/volk_16i_branch_4_state_8_a.h +++ b/volk/include/volk/volk_16i_branch_4_state_8_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_branch_4_state_8_a16_H -#define INCLUDED_volk_16i_branch_4_state_8_a16_H +#ifndef 
INCLUDED_volk_16i_branch_4_state_8_a_H +#define INCLUDED_volk_16i_branch_4_state_8_a_H #include @@ -14,7 +14,7 @@ #include #include -static inline void volk_16i_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { +static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; @@ -138,7 +138,7 @@ static inline void volk_16i_branch_4_state_8_a16_ssse3(short* target, short* s #endif /*LV_HAVE_SSEs*/ #ifdef LV_HAVE_GENERIC -static inline void volk_16i_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { +static inline void volk_16i_branch_4_state_8_a_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { int i = 0; int bound = 4; @@ -191,4 +191,4 @@ static inline void volk_16i_branch_4_state_8_a16_generic(short* target, short* #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_16i_branch_4_state_8_a16_H*/ +#endif /*INCLUDED_volk_16i_branch_4_state_8_a_H*/ diff --git a/volk/include/volk/volk_16i_convert_8i_a.h b/volk/include/volk/volk_16i_convert_8i_a.h index 4d51e5903..8046035c7 100644 --- a/volk/include/volk/volk_16i_convert_8i_a.h +++ b/volk/include/volk/volk_16i_convert_8i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_convert_8i_a16_H -#define INCLUDED_volk_16i_convert_8i_a16_H +#ifndef INCLUDED_volk_16i_convert_8i_a_H +#define INCLUDED_volk_16i_convert_8i_a_H #include #include @@ -12,7 +12,7 @@ \param outputVector The 8 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_16i_convert_8i_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ +static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, 
unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; @@ -52,7 +52,7 @@ static inline void volk_16i_convert_8i_a16_sse2(int8_t* outputVector, const int1 \param outputVector The 8 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_16i_convert_8i_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ +static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ int8_t* outputVectorPtr = outputVector; const int16_t* inputVectorPtr = inputVector; unsigned int number = 0; @@ -66,4 +66,4 @@ static inline void volk_16i_convert_8i_a16_generic(int8_t* outputVector, const i -#endif /* INCLUDED_volk_16i_convert_8i_a16_H */ +#endif /* INCLUDED_volk_16i_convert_8i_a_H */ diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/include/volk/volk_16i_max_star_16i_a.h index 063444279..6a4f63708 100644 --- a/volk/include/volk/volk_16i_max_star_16i_a.h +++ b/volk/include/volk/volk_16i_max_star_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_max_star_16i_a16_H -#define INCLUDED_volk_16i_max_star_16i_a16_H +#ifndef INCLUDED_volk_16i_max_star_16i_a_H +#define INCLUDED_volk_16i_max_star_16i_a_H #include @@ -12,7 +12,7 @@ #include #include -static inline void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, unsigned int num_bytes) { +static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_bytes) { @@ -87,7 +87,7 @@ static inline void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, #ifdef LV_HAVE_GENERIC -static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, unsigned int num_bytes) { +static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, unsigned int num_bytes) { int i = 0; @@ -105,4 +105,4 @@ static inline void volk_16i_max_star_16i_a16_generic(short* target, 
short* src0, #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_16i_max_star_16i_a16_H*/ +#endif /*INCLUDED_volk_16i_max_star_16i_a_H*/ diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h index ece6adb40..f60b33a41 100644 --- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h +++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a16_H -#define INCLUDED_volk_16i_max_star_horizontal_16i_a16_H +#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H +#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H #include @@ -12,7 +12,7 @@ #include #include -static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { +static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; @@ -110,7 +110,7 @@ static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, #ifdef LV_HAVE_GENERIC -static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { +static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { int i = 0; @@ -127,4 +127,4 @@ static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a16_H*/ +#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/ diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h index ae1a18157..de36cee80 100644 --- 
a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h +++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a16_H -#define INCLUDED_volk_16i_permute_and_scalar_add_a16_H +#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H +#define INCLUDED_volk_16i_permute_and_scalar_add_a_H #include @@ -13,7 +13,7 @@ #include #include -static inline void volk_16i_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { +static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; @@ -117,7 +117,7 @@ static inline void volk_16i_permute_and_scalar_add_a16_sse2(short* target, sho #ifdef LV_HAVE_GENERIC -static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { +static inline void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { int i = 0; @@ -136,4 +136,4 @@ static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, sh #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a16_H*/ +#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/ diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a.h b/volk/include/volk/volk_16i_s32f_convert_32f_a.h index 09bc252f0..0555fdf00 100644 --- a/volk/include/volk/volk_16i_s32f_convert_32f_a.h +++ b/volk/include/volk/volk_16i_s32f_convert_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_s32f_convert_32f_a16_H 
-#define INCLUDED_volk_16i_s32f_convert_32f_a16_H +#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H +#define INCLUDED_volk_16i_s32f_convert_32f_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; @@ -68,7 +68,7 @@ static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, con \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -102,7 +102,7 @@ static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_16i_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ float* outputVectorPtr = outputVector; const int16_t* inputVectorPtr = inputVector; unsigned int number = 0; @@ -116,4 +116,4 @@ static inline void 
volk_16i_s32f_convert_32f_a16_generic(float* outputVector, co -#endif /* INCLUDED_volk_16i_s32f_convert_32f_a16_H */ +#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h index 94e5eb986..2688aff04 100644 --- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h +++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H -#define INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H +#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H +#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H #include @@ -13,7 +13,7 @@ #include -static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { +static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { @@ -96,9 +96,9 @@ static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* /*asm volatile ( - "volk_16i_x4_quad_max_star_16i_a16_sse2_L1:\n\t" + "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t" "cmp $0, %[bound]\n\t" - "je volk_16i_x4_quad_max_star_16i_a16_sse2_END\n\t" + "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t" "movaps (%[src0]), %%xmm1\n\t" "movaps (%[src1]), %%xmm2\n\t" @@ -143,9 +143,9 @@ static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* "movaps %%xmm1, (%[target])\n\t" "addw $16, %[target]\n\t" - "jmp volk_16i_x4_quad_max_star_16i_a16_sse2_L1\n\t" + "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t" - "volk_16i_x4_quad_max_star_16i_a16_sse2_END:\n\t" + "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t" : :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) : @@ -168,7 +168,7 @@ static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* #ifdef 
LV_HAVE_GENERIC -static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { +static inline void volk_16i_x4_quad_max_star_16i_a_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { int i = 0; @@ -188,4 +188,4 @@ static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, shor #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H*/ +#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/ diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h index c157bf64a..e4c9f17ed 100644 --- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h +++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H -#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H +#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H +#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H #include @@ -13,7 +13,7 @@ #include #include -static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { +static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4; __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; @@ -65,9 +65,9 @@ static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* } /*asm volatile ( - ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1:\n\t" + ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t" "cmp $0, %[bound]\n\t" - "je .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END\n\t" + "je 
.%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t" "movaps (%[src0]), %%xmm1\n\t" "movaps (%[src1]), %%xmm2\n\t" "movaps (%[src2]), %%xmm3\n\t" @@ -91,8 +91,8 @@ static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* "add $16, %[target1]\n\t" "add $16, %[target2]\n\t" "add $16, %[target3]\n\t" - "jmp .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1\n\t" - ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END:\n\t" + "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t" + ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t" : :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -113,7 +113,7 @@ static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* #ifdef LV_HAVE_GENERIC -static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { +static inline void volk_16i_x5_add_quad_16i_x4_a_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { int i = 0; @@ -133,4 +133,4 @@ static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short -#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H*/ +#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/ diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h index 227a92303..cdd60235e 100644 --- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h +++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H -#define INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H +#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H +#define 
INCLUDED_volk_16ic_deinterleave_16i_x2_a_H #include #include @@ -13,7 +13,7 @@ \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -61,7 +61,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -128,7 +128,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int1 \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; int16_t* qBufferPtr = qBuffer; @@ -148,11 +148,11 @@ static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, 
i \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -extern void volk_16ic_deinterleave_16i_x2_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16ic_deinterleave_16i_x2_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16ic_deinterleave_16i_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); +extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16ic_deinterleave_16i_x2_a_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H */ +#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h index 35d0e8be2..2b99e068e 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a16_H -#define INCLUDED_volk_16ic_deinterleave_real_16i_a16_H +#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H +#define INCLUDED_volk_16ic_deinterleave_real_16i_a_H #include #include @@ -12,7 +12,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 
0; const int16_t* complexVectorPtr = (int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -55,7 +55,7 @@ static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, c \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -103,7 +103,7 @@ static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, co \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -117,4 +117,4 @@ static inline void volk_16ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, -#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a16_H */ +#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h index bdf5fc162..cd2fabb52 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a16_H -#define INCLUDED_volk_16ic_deinterleave_real_8i_a16_H +#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H +#define 
INCLUDED_volk_16ic_deinterleave_real_8i_a_H #include #include @@ -12,7 +12,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int8_t* iBufferPtr = iBuffer; @@ -66,7 +66,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; int16_t* complexVectorPtr = (int16_t*)complexVector; int8_t* iBufferPtr = iBuffer; @@ -84,11 +84,11 @@ static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, c \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -extern void volk_16ic_deinterleave_real_8i_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16ic_deinterleave_real_8i_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16ic_deinterleave_real_8i_a16_orc_impl(iBuffer, complexVector, num_points); +extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16ic_deinterleave_real_8i_a_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ 
+ volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a16_H */ +#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */ diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a.h b/volk/include/volk/volk_16ic_magnitude_16i_a.h index 73c6f3390..a6951e967 100644 --- a/volk/include/volk/volk_16ic_magnitude_16i_a.h +++ b/volk/include/volk/volk_16ic_magnitude_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H -#define INCLUDED_volk_16ic_magnitude_16i_a16_H +#ifndef INCLUDED_volk_16ic_magnitude_16i_a_H +#define INCLUDED_volk_16ic_magnitude_16i_a_H #include #include @@ -14,7 +14,7 @@ \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -85,7 +85,7 @@ static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, co \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -161,7 +161,7 @@ static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, con \param magnitudeVector The vector 
containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; int16_t* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -181,11 +181,11 @@ static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -extern void volk_16ic_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); -static inline void volk_16ic_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16ic_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); +extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); +static inline void volk_16ic_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_16ic_magnitude_16i_a16_H */ +#endif /* INCLUDED_volk_16ic_magnitude_16i_a_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h index e4a9015b4..e73d405e0 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h +++ 
b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H -#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H +#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H +#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H #include #include @@ -15,7 +15,7 @@ \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -78,7 +78,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, fl \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -99,11 +99,11 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex data values to be deinterleaved */ -extern void 
volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); +extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16ic_s32f_deinterleave_32f_x2_a_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H */ +#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h index 993445995..1630cb0ed 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H -#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H +#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H +#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void 
volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; unsigned int number = 0; @@ -62,7 +62,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffe \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; unsigned int number = 0; @@ -108,7 +108,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* iBufferPtr = iBuffer; @@ -123,4 +123,4 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a16_generic(float* iBuff -#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H */ +#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H */ diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h index a136c0535..35406e2cb 100644 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h 
@@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H -#define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H +#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H +#define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H #include #include @@ -15,7 +15,7 @@ \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -80,7 +80,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -149,7 +149,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const 
float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_magnitude_32f_a_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -170,11 +170,11 @@ static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVect \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -extern void volk_16ic_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16ic_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16ic_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16ic_s32f_magnitude_32f_a_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H */ +#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a_H */ diff --git a/volk/include/volk/volk_16u_byteswap_a.h b/volk/include/volk/volk_16u_byteswap_a.h index f393c05c5..75c7ef0f3 100644 --- a/volk/include/volk/volk_16u_byteswap_a.h +++ b/volk/include/volk/volk_16u_byteswap_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_16u_byteswap_a16_H -#define INCLUDED_volk_16u_byteswap_a16_H +#ifndef INCLUDED_volk_16u_byteswap_a_H +#define 
INCLUDED_volk_16u_byteswap_a_H #include #include @@ -12,7 +12,7 @@ \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){ unsigned int number = 0; uint16_t* inputPtr = intsToSwap; __m128i input, left, right, output; @@ -49,7 +49,7 @@ static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){ unsigned int point; uint16_t* inputPtr = intsToSwap; for(point = 0; point < num_points; point++){ @@ -67,11 +67,11 @@ static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -extern void volk_16u_byteswap_a16_orc_impl(uint16_t* intsToSwap, unsigned int num_points); -static inline void volk_16u_byteswap_a16_orc(uint16_t* intsToSwap, unsigned int num_points){ - volk_16u_byteswap_a16_orc_impl(intsToSwap, num_points); +extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points); +static inline void volk_16u_byteswap_a_orc(uint16_t* intsToSwap, unsigned int num_points){ + volk_16u_byteswap_a_orc_impl(intsToSwap, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_16u_byteswap_a16_H */ +#endif /* INCLUDED_volk_16u_byteswap_a_H */ diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a.h b/volk/include/volk/volk_32f_accumulator_s32f_a.h index dd24a1e29..7ce0d1c80 100644 --- a/volk/include/volk/volk_32f_accumulator_s32f_a.h +++ b/volk/include/volk/volk_32f_accumulator_s32f_a.h @@ 
-1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H -#define INCLUDED_volk_32f_accumulator_s32f_a16_H +#ifndef INCLUDED_volk_32f_accumulator_s32f_a_H +#define INCLUDED_volk_32f_accumulator_s32f_a_H #include #include @@ -13,7 +13,7 @@ \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated */ -static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){ float returnValue = 0; unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -50,7 +50,7 @@ static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated */ -static inline void volk_32f_accumulator_s32f_a16_generic(float* result, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_accumulator_s32f_a_generic(float* result, const float* inputBuffer, unsigned int num_points){ const float* aPtr = inputBuffer; unsigned int number = 0; float returnValue = 0; @@ -65,4 +65,4 @@ static inline void volk_32f_accumulator_s32f_a16_generic(float* result, const fl -#endif /* INCLUDED_volk_32f_accumulator_s32f_a16_H */ +#endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_a.h b/volk/include/volk/volk_32f_convert_64f_a.h index 8ca83220b..dda646409 100644 --- a/volk/include/volk/volk_32f_convert_64f_a.h +++ b/volk/include/volk/volk_32f_convert_64f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_convert_64f_a16_H -#define INCLUDED_volk_32f_convert_64f_a16_H +#ifndef INCLUDED_volk_32f_convert_64f_a_H +#define INCLUDED_volk_32f_convert_64f_a_H #include #include @@ -12,7 +12,7 @@ \param fVector The float vector values to be converted 
\param num_points The number of points in the two vectors to be converted */ -static inline void volk_32f_convert_64f_a16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ +static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -53,7 +53,7 @@ static inline void volk_32f_convert_64f_a16_sse2(double* outputVector, const flo \param fVector The float vector values to be converted \param num_points The number of points in the two vectors to be converted */ -static inline void volk_32f_convert_64f_a16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ +static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){ double* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; @@ -67,4 +67,4 @@ static inline void volk_32f_convert_64f_a16_generic(double* outputVector, const -#endif /* INCLUDED_volk_32f_convert_64f_a16_H */ +#endif /* INCLUDED_volk_32f_convert_64f_a_H */ diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/include/volk/volk_32f_index_max_16u_a.h index af1f35348..3e0cf1d65 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a.h +++ b/volk/include/volk/volk_32f_index_max_16u_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_index_max_16u_a16_H -#define INCLUDED_volk_32f_index_max_16u_a16_H +#ifndef INCLUDED_volk_32f_index_max_16u_a_H +#define INCLUDED_volk_32f_index_max_16u_a_H #include #include @@ -9,7 +9,7 @@ #ifdef LV_HAVE_SSE4_1 #include -static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { +static inline void volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { if(num_points > 0){ unsigned int number = 0; const unsigned int 
quarterPoints = num_points / 4; @@ -67,7 +67,7 @@ static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const #ifdef LV_HAVE_SSE #include -static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const float* src0, unsigned int num_points) { +static inline void volk_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) { if(num_points > 0){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -124,7 +124,7 @@ static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const fl #endif /*LV_HAVE_SSE*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, const float* src0, unsigned int num_points) { +static inline void volk_32f_index_max_16u_a_generic(unsigned int* target, const float* src0, unsigned int num_points) { if(num_points > 0){ float max = src0[0]; unsigned int index = 0; @@ -146,4 +146,4 @@ static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, cons #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_32f_index_max_16u_a16_H*/ +#endif /*INCLUDED_volk_32f_index_max_16u_a_H*/ diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h index 6efd21a37..b25df75a1 100644 --- a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H -#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H +#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H +#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H #include #include @@ -14,7 +14,7 @@ \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. \param num_noints The number of real values in the input vector. 
*/ -static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ +static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ if (num_points < 1) { return; } @@ -87,7 +87,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. \param num_points The number of real values in the input vector. */ -static inline void volk_32f_s32f_32f_fm_detect_32f_a16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ +static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ if (num_points < 1) { return; } @@ -117,4 +117,4 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a16_generic(float* outputVect -#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h index f5b388e6d..b1902a8c0 100644 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H -#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H +#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H +#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H #include #include @@ -17,7 +17,7 @@ \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be 
excluded from the noise floor calculation - default value is 20 \param noiseFloorAmplitude The noise floor of the input spectrum, in dB */ -static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -128,7 +128,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* no \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 \param noiseFloorAmplitude The noise floor of the input spectrum, in dB */ -static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ float sumMean = 0.0; unsigned int number; // find the sum (for mean), etc @@ -165,4 +165,4 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_generic(float -#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index 4acd2e13e..0a2b4f0f2 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H -#define 
INCLUDED_volk_32f_s32f_convert_16i_a16_H +#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H +#define INCLUDED_volk_32f_s32f_convert_16i_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; @@ -54,7 +54,7 @@ static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, con \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -94,7 +94,7 @@ static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, cons \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_16i_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ int16_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; @@ -108,4 +108,4 @@ static inline void 
volk_32f_s32f_convert_16i_a16_generic(int16_t* outputVector, -#endif /* INCLUDED_volk_32f_s32f_convert_16i_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h index 3f5044313..aa370e614 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H -#define INCLUDED_volk_32f_s32f_convert_32i_a16_H +#ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H +#define INCLUDED_volk_32f_s32f_convert_32i_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_32i_a16_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; @@ -50,7 +50,7 @@ static inline void volk_32f_s32f_convert_32i_a16_avx(int32_t* outputVector, cons \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -86,7 +86,7 @@ static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, con \param scalar The value multiplied against each point in the input buffer \param num_points The number of data 
values to be converted */ -static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -126,7 +126,7 @@ static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, cons \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_32i_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ int32_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; @@ -140,4 +140,4 @@ static inline void volk_32f_s32f_convert_32i_a16_generic(int32_t* outputVector, -#endif /* INCLUDED_volk_32f_s32f_convert_32i_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h index c114ea38f..8d87a07d7 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H -#define INCLUDED_volk_32f_s32f_convert_8i_a16_H +#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H +#define INCLUDED_volk_32f_s32f_convert_8i_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const float* inputVector, const 
float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; @@ -61,7 +61,7 @@ static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -101,7 +101,7 @@ static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const \param scalar The value multiplied against each point in the input buffer \param num_points The number of data values to be converted */ -static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ int8_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; @@ -115,4 +115,4 @@ static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, co -#endif /* INCLUDED_volk_32f_s32f_convert_8i_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_normalize_a.h b/volk/include/volk/volk_32f_s32f_normalize_a.h index e6195cd32..f5fd0d1db 100644 --- a/volk/include/volk/volk_32f_s32f_normalize_a.h +++ b/volk/include/volk/volk_32f_s32f_normalize_a.h @@ -1,5 +1,5 @@ -#ifndef 
INCLUDED_volk_32f_s32f_normalize_a16_H -#define INCLUDED_volk_32f_s32f_normalize_a16_H +#ifndef INCLUDED_volk_32f_s32f_normalize_a_H +#define INCLUDED_volk_32f_s32f_normalize_a_H #include #include @@ -12,7 +12,7 @@ \param num_points The number of values in vecBuffer \param scalar The scale value to be applied to each buffer value */ -static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float scalar, unsigned int num_points){ unsigned int number = 0; float* inputPtr = vecBuffer; @@ -49,7 +49,7 @@ static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float \param bVector One of the vectors to be normalizeed \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector */ -static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_normalize_a_generic(float* vecBuffer, const float scalar, unsigned int num_points){ unsigned int number = 0; float* inputPtr = vecBuffer; const float invScalar = 1.0 / scalar; @@ -68,14 +68,14 @@ static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const f \param bVector One of the vectors to be normalizeed \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector */ -extern void volk_32f_s32f_normalize_a16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); -static inline void volk_32f_s32f_normalize_a16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ +extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); +static inline void volk_32f_s32f_normalize_a_orc(float* vecBuffer, const float scalar, unsigned int num_points){ float invscalar = 1.0 / scalar; - 
volk_32f_s32f_normalize_a16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); + volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); } #endif /* LV_HAVE_GENERIC */ -#endif /* INCLUDED_volk_32f_s32f_normalize_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_normalize_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/include/volk/volk_32f_s32f_power_32f_a.h index ecff901e1..c4fa31bd1 100644 --- a/volk/include/volk/volk_32f_s32f_power_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_power_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_power_32f_a16_H -#define INCLUDED_volk_32f_s32f_power_32f_a16_H +#ifndef INCLUDED_volk_32f_s32f_power_32f_a_H +#define INCLUDED_volk_32f_s32f_power_32f_a_H #include #include @@ -19,7 +19,7 @@ \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ +static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -76,7 +76,7 @@ static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const floa \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ +static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -127,7 +127,7 @@ static inline void 
volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32f_s32f_power_32f_a16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ +static inline void volk_32f_s32f_power_32f_a_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; unsigned int number = 0; @@ -141,4 +141,4 @@ static inline void volk_32f_s32f_power_32f_a16_generic(float* cVector, const flo -#endif /* INCLUDED_volk_32f_s32f_power_32f_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h index c2b903657..881067bdc 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H -#define INCLUDED_volk_32f_s32f_stddev_32f_a16_H +#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H +#define INCLUDED_volk_32f_s32f_stddev_32f_a_H #include #include @@ -15,7 +15,7 @@ \param mean The mean of the input buffer \param num_points The number of values in input buffer to used in the stddev calculation */ -static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ +static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ float returnValue = 0; if(num_points > 0){ unsigned int number = 0; @@ -75,7 +75,7 @@ static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const floa \param mean The mean of the input buffer \param num_points The number of values in input buffer to used in the stddev 
calculation */ -static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ +static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ float returnValue = 0; if(num_points > 0){ unsigned int number = 0; @@ -120,7 +120,7 @@ static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* \param mean The mean of the input buffer \param num_points The number of values in input buffer to used in the stddev calculation */ -static inline void volk_32f_s32f_stddev_32f_a16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ +static inline void volk_32f_s32f_stddev_32f_a_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ float returnValue = 0; if(num_points > 0){ const float* aPtr = inputBuffer; @@ -142,4 +142,4 @@ static inline void volk_32f_s32f_stddev_32f_a16_generic(float* stddev, const flo -#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a16_H */ +#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */ diff --git a/volk/include/volk/volk_32f_sqrt_32f_a.h b/volk/include/volk/volk_32f_sqrt_32f_a.h index a9ce76f88..e44c73cfd 100644 --- a/volk/include/volk/volk_32f_sqrt_32f_a.h +++ b/volk/include/volk/volk_32f_sqrt_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_sqrt_32f_a16_H -#define INCLUDED_volk_32f_sqrt_32f_a16_H +#ifndef INCLUDED_volk_32f_sqrt_32f_a_H +#define INCLUDED_volk_32f_sqrt_32f_a_H #include #include @@ -13,7 +13,7 @@ \param aVector One of the vectors to be sqrted \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector */ -static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVector, unsigned int num_points){ +static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points){ unsigned int number = 0; const 
unsigned int quarterPoints = num_points / 4; @@ -47,7 +47,7 @@ static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVecto \param aVector One of the vectors to be sqrted \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector */ -static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aVector, unsigned int num_points){ +static inline void volk_32f_sqrt_32f_a_generic(float* cVector, const float* aVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; unsigned int number = 0; @@ -59,19 +59,19 @@ static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aV #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void volk_32f_sqrt_32f_a16_orc_impl(float *, const float*, unsigned int); +extern void volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int); /*! \brief Sqrts the two input vectors and store their results in the third vector \param cVector The vector where the results will be stored \param aVector One of the vectors to be sqrted \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector */ -static inline void volk_32f_sqrt_32f_a16_orc(float* cVector, const float* aVector, unsigned int num_points){ - volk_32f_sqrt_32f_a16_orc_impl(cVector, aVector, num_points); +static inline void volk_32f_sqrt_32f_a_orc(float* cVector, const float* aVector, unsigned int num_points){ + volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_sqrt_32f_a16_H */ +#endif /* INCLUDED_volk_32f_sqrt_32f_a_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h index 10d72e09d..3a82e3d2f 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h @@ -1,5 +1,5 @@ -#ifndef 
INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H -#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H +#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H +#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H #include #include @@ -15,7 +15,7 @@ \param inputBuffer The buffer of points to calculate the std deviation for \param num_points The number of values in input buffer to used in the stddev and mean calculations */ -static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ float returnValue = 0; float newMean = 0; if(num_points > 0){ @@ -88,7 +88,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, flo \param inputBuffer The buffer of points to calculate the std deviation for \param num_points The number of values in input buffer to used in the stddev and mean calculations */ -static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ float returnValue = 0; float newMean = 0; if(num_points > 0){ @@ -143,7 +143,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* \param inputBuffer The buffer of points to calculate the std deviation for \param num_points The number of values in input buffer to used in the stddev and mean calculations */ -static inline void volk_32f_stddev_and_mean_32f_x2_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_stddev_and_mean_32f_x2_a_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ float returnValue = 0; float newMean = 0; 
if(num_points > 0){ @@ -167,4 +167,4 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a16_generic(float* stddev, fl -#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H */ +#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */ diff --git a/volk/include/volk/volk_32f_x2_add_32f_a.h b/volk/include/volk/volk_32f_x2_add_32f_a.h index 2de6a6644..3bc83653b 100644 --- a/volk/include/volk/volk_32f_x2_add_32f_a.h +++ b/volk/include/volk/volk_32f_x2_add_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_add_32f_a16_H -#define INCLUDED_volk_32f_x2_add_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_add_32f_a_H +#define INCLUDED_volk_32f_x2_add_32f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector One of the vectors to be added \param num_points The number of values in aVector and bVector to be added together and stored into cVector */ -static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -51,7 +51,7 @@ static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVec \param bVector One of the vectors to be added \param num_points The number of values in aVector and bVector to be added together and stored into cVector */ -static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -71,11 +71,11 @@ static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* \param bVector One of the vectors to be added \param num_points The number of values in aVector 
and bVector to be added together and stored into cVector */ -extern void volk_32f_x2_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_add_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_x2_add_32f_a16_H */ +#endif /* INCLUDED_volk_32f_x2_add_32f_a_H */ diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a.h b/volk/include/volk/volk_32f_x2_divide_32f_a.h index 1603e78de..52ddfae87 100644 --- a/volk/include/volk/volk_32f_x2_divide_32f_a.h +++ b/volk/include/volk/volk_32f_x2_divide_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_divide_32f_a16_H -#define INCLUDED_volk_32f_x2_divide_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_divide_32f_a_H +#define INCLUDED_volk_32f_x2_divide_32f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The divisor vector \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector */ -static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -51,7 +51,7 @@ static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* a \param bVector The divisor vector \param num_points The number of 
values in aVector and bVector to be divideed together and stored into cVector */ -static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_divide_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -71,12 +71,12 @@ static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const floa \param bVector The divisor vector \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector */ -extern void volk_32f_x2_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_divide_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_x2_divide_32f_a16_H */ +#endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h index 2cd974070..0c58f2ecf 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H -#define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H +#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H #include #include @@ -8,7 +8,7 
@@ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { float dotProduct = 0; const float* aPtr = input; @@ -28,7 +28,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const fl #ifdef LV_HAVE_SSE -static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -78,7 +78,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float #include -static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -125,7 +125,7 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float #include -static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; @@ -182,4 +182,4 @@ static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const flo #endif /*LV_HAVE_SSE4_1*/ -#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a16_H*/ +#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git 
a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h index f3731fa2a..1d4d2dbbd 100644 --- a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h +++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a16_H -#define INCLUDED_volk_32f_x2_interleave_32fc_a16_H +#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a_H +#define INCLUDED_volk_32f_x2_interleave_32fc_a_H #include #include @@ -13,7 +13,7 @@ \param complexVector The complex output vector \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ +static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ unsigned int number = 0; float* complexVectorPtr = (float*)complexVector; const float* iBufferPtr = iBuffer; @@ -56,7 +56,7 @@ static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, \param complexVector The complex output vector \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ +static inline void volk_32f_x2_interleave_32fc_a_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ float* complexVectorPtr = (float*)complexVector; const float* iBufferPtr = iBuffer; const float* qBufferPtr = qBuffer; @@ -72,4 +72,4 @@ static inline void volk_32f_x2_interleave_32fc_a16_generic(lv_32fc_t* complexVec -#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a16_H */ +#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */ diff --git a/volk/include/volk/volk_32f_x2_max_32f_a.h b/volk/include/volk/volk_32f_x2_max_32f_a.h 
index 60be6e36d..7948c458d 100644 --- a/volk/include/volk/volk_32f_x2_max_32f_a.h +++ b/volk/include/volk/volk_32f_x2_max_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_max_32f_a16_H -#define INCLUDED_volk_32f_x2_max_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_max_32f_a_H +#define INCLUDED_volk_32f_x2_max_32f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -53,7 +53,7 @@ static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVec \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_max_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -75,11 +75,11 @@ static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -extern void volk_32f_x2_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_max_32f_a16_orc_impl(cVector, aVector, 
bVector, num_points); +extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_max_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_x2_max_32f_a16_H */ +#endif /* INCLUDED_volk_32f_x2_max_32f_a_H */ diff --git a/volk/include/volk/volk_32f_x2_min_32f_a.h b/volk/include/volk/volk_32f_x2_min_32f_a.h index 3b8291531..d77134868 100644 --- a/volk/include/volk/volk_32f_x2_min_32f_a.h +++ b/volk/include/volk/volk_32f_x2_min_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_min_32f_a16_H -#define INCLUDED_volk_32f_x2_min_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_min_32f_a_H +#define INCLUDED_volk_32f_x2_min_32f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -53,7 +53,7 @@ static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVec \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_min_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* 
bPtr= bVector; @@ -75,11 +75,11 @@ static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -extern void volk_32f_x2_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_min_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_x2_min_32f_a16_H */ +#endif /* INCLUDED_volk_32f_x2_min_32f_a_H */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a.h b/volk/include/volk/volk_32f_x2_multiply_32f_a.h index 885941abf..fae9a652f 100644 --- a/volk/include/volk/volk_32f_x2_multiply_32f_a.h +++ b/volk/include/volk/volk_32f_x2_multiply_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_multiply_32f_a16_H -#define INCLUDED_volk_32f_x2_multiply_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H +#define INCLUDED_volk_32f_x2_multiply_32f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector One of the vectors to be multiplied \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ 
unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -52,7 +52,7 @@ static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* \param bVector One of the vectors to be multiplied \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32f_x2_multiply_32f_a16_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; @@ -90,7 +90,7 @@ static inline void volk_32f_x2_multiply_32f_a16_avx(float* cVector, const float* \param bVector One of the vectors to be multiplied \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -110,11 +110,11 @@ static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const fl \param bVector One of the vectors to be multiplied \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -extern void volk_32f_x2_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void 
volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_multiply_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_x2_multiply_32f_a16_H */ +#endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */ diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h index f7ad3fd18..cc02c3678 100644 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H -#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H +#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H +#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H #include #include @@ -15,7 +15,7 @@ \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ unsigned int number = 0; const float* iBufferPtr = iBuffer; const float* qBufferPtr = qBuffer; @@ -73,7 +73,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexV \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, 
const float scalar, unsigned int num_points){ +static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ unsigned int number = 0; const float* iBufferPtr = iBuffer; const float* qBufferPtr = qBuffer; @@ -137,7 +137,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVe \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_s32f_interleave_16ic_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_x2_s32f_interleave_16ic_a_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ int16_t* complexVectorPtr = (int16_t*)complexVector; const float* iBufferPtr = iBuffer; const float* qBufferPtr = qBuffer; @@ -153,4 +153,4 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a16_generic(lv_16sc_t* compl -#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H */ +#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */ diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a.h b/volk/include/volk/volk_32f_x2_subtract_32f_a.h index c01f2c1f3..16cad008a 100644 --- a/volk/include/volk/volk_32f_x2_subtract_32f_a.h +++ b/volk/include/volk/volk_32f_x2_subtract_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x2_subtract_32f_a16_H -#define INCLUDED_volk_32f_x2_subtract_32f_a16_H +#ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H +#define INCLUDED_volk_32f_x2_subtract_32f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The vector to be subtracted \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector */ -static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const 
float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -51,7 +51,7 @@ static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* \param bVector The vector to be subtracted \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector */ -static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_subtract_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -71,11 +71,11 @@ static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const fl \param bVector The vector to be subtracted \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector */ -extern void volk_32f_x2_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_x2_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_subtract_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32f_x2_subtract_32f_a16_H */ +#endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */ 
diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h index 6e446cbef..2ea8fa96d 100644 --- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h +++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H -#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H +#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H +#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H #include #include @@ -13,7 +13,7 @@ #include #include -static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { +static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { float result = 0.0; @@ -100,7 +100,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* sr #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { +static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { @@ -148,4 +148,4 @@ static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H*/ +#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/ diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h index 846315a4a..b7350b9fa 100644 --- a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a16_H -#define INCLUDED_volk_32fc_32f_multiply_32fc_a16_H +#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H +#define 
INCLUDED_volk_32fc_32f_multiply_32fc_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The vectors containing the float values to be multiplied against each complex value in aVector \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -64,7 +64,7 @@ static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; const float* bPtr= bVector; @@ -84,12 +84,12 @@ static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, c \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -extern void volk_32fc_32f_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32fc_32f_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, 
unsigned int num_points){ - volk_32fc_32f_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32fc_32f_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_GENERIC */ -#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a16_H */ +#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h index 3e7c3fa28..9de036ef4 100644 --- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H -#define INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H +#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a_H +#define INCLUDED_volk_32fc_deinterleave_32f_x2_a_H #include #include @@ -13,7 +13,7 @@ \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -57,7 +57,7 @@ static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const 
lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -72,4 +72,4 @@ static inline void volk_32fc_deinterleave_32f_x2_a16_generic(float* iBuffer, flo -#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H */ +#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h index 945a26742..29c369d9a 100644 --- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H -#define INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H +#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H +#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H #include #include @@ -13,7 +13,7 @@ \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; @@ -59,7 +59,7 @@ static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, doubl \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_64f_x2_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* 
iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; double* iBufferPtr = iBuffer; @@ -75,4 +75,4 @@ static inline void volk_32fc_deinterleave_64f_x2_a16_generic(double* iBuffer, do -#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H */ +#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h index 3c3fb2583..a1d0fd5d1 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a16_H -#define INCLUDED_volk_32fc_deinterleave_real_32f_a16_H +#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a_H +#define INCLUDED_volk_32fc_deinterleave_real_32f_a_H #include #include @@ -12,7 +12,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -51,7 +51,7 @@ static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_real_32f_a_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; float* 
iBufferPtr = iBuffer; @@ -65,4 +65,4 @@ static inline void volk_32fc_deinterleave_real_32f_a16_generic(float* iBuffer, c -#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a16_H */ +#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h index 40c1a7a46..70a3b1971 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a16_H -#define INCLUDED_volk_32fc_deinterleave_real_64f_a16_H +#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a_H +#define INCLUDED_volk_32fc_deinterleave_real_64f_a_H #include #include @@ -12,7 +12,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; @@ -49,7 +49,7 @@ static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, con \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_real_64f_a16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_real_64f_a_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; double* iBufferPtr = iBuffer; @@ -63,4 +63,4 @@ static inline void volk_32fc_deinterleave_real_64f_a16_generic(double* iBuffer, -#endif /* 
INCLUDED_volk_32fc_deinterleave_real_64f_a16_H */ +#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a_H */ diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/include/volk/volk_32fc_index_max_16u_a.h index 0ad1edbe9..312e034e2 100644 --- a/volk/include/volk/volk_32fc_index_max_16u_a.h +++ b/volk/include/volk/volk_32fc_index_max_16u_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_index_max_16u_a16_H -#define INCLUDED_volk_32fc_index_max_16u_a16_H +#ifndef INCLUDED_volk_32fc_index_max_16u_a_H +#define INCLUDED_volk_32fc_index_max_16u_a_H #include #include @@ -11,7 +11,7 @@ #include -static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { +static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { @@ -189,7 +189,7 @@ static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32f #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { +static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { float sq_dist = 0.0; float max = 0.0; unsigned int index = 0; @@ -212,4 +212,4 @@ static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_ #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_32fc_index_max_16u_a16_H*/ +#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a.h b/volk/include/volk/volk_32fc_magnitude_32f_a.h index 946190e41..f18e9bc0b 100644 --- a/volk/include/volk/volk_32fc_magnitude_32f_a.h +++ b/volk/include/volk/volk_32fc_magnitude_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_magnitude_32f_a16_H -#define INCLUDED_volk_32fc_magnitude_32f_a16_H +#ifndef INCLUDED_volk_32fc_magnitude_32f_a_H +#define INCLUDED_volk_32fc_magnitude_32f_a_H #include #include @@ -13,7 +13,7 @@ 
\param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -56,7 +56,7 @@ static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, cons \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -103,7 +103,7 @@ static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; float* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -122,11 +122,11 @@ static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, c \param magnitudeVector The vector containing the real output values \param 
num_points The number of complex values in complexVector to be calculated and stored into cVector */ -extern void volk_32fc_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); -static inline void volk_32fc_magnitude_32f_a16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - volk_32fc_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, num_points); +extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); +static inline void volk_32fc_magnitude_32f_a_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32fc_magnitude_32f_a16_H */ +#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h index 55b1b6c70..9304b0c28 100644 --- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a16_H -#define INCLUDED_volk_32fc_s32f_atan2_32f_a16_H +#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H +#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H #include #include @@ -19,7 +19,7 @@ \param normalizeFactor The atan2 results will be divided by this normalization factor. \param num_points The number of complex values in the input vector. 
*/ -static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; float* outPtr = outputVector; @@ -81,7 +81,7 @@ static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, con \param normalizeFactor The atan2 results will be divided by this normalization factor. \param num_points The number of complex values in the input vector. */ -static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; float* outPtr = outputVector; @@ -139,7 +139,7 @@ static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const \param normalizeFactor The atan2 results will be divided by this normalization factor. \param num_points The number of complex values in the input vector. 
*/ -static inline void volk_32fc_s32f_atan2_32f_a16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_atan2_32f_a_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ float* outPtr = outputVector; const float* inPtr = (float*)inputVector; const float invNormalizeFactor = 1.0 / normalizeFactor; @@ -155,4 +155,4 @@ static inline void volk_32fc_s32f_atan2_32f_a16_generic(float* outputVector, con -#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a16_H */ +#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h index 2460039d2..1c17fb70c 100644 --- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h +++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H -#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H +#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H +#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H #include #include @@ -14,7 +14,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -63,7 +63,7 @@ static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void 
volk_32fc_s32f_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; int16_t* iBufferPtr = iBuffer; unsigned int number = 0; @@ -78,4 +78,4 @@ static inline void volk_32fc_s32f_deinterleave_real_16i_a16_generic(int16_t* iBu -#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H */ +#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h index f67ab0607..38fd609d3 100644 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H -#define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H +#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H +#define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H #include #include @@ -15,7 +15,7 @@ \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -70,7 +70,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVecto \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and 
stored into cVector */ -static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -129,7 +129,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; int16_t* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -149,11 +149,11 @@ static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVe \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -extern void volk_32fc_s32f_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_32fc_s32f_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - volk_32fc_s32f_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +extern void volk_32fc_s32f_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int 
num_points); +static inline void volk_32fc_s32f_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + volk_32fc_s32f_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H */ +#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h index 155b93ca2..ec1d7167f 100644 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h @@ -1,12 +1,12 @@ -#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a16_H -#define INCLUDED_volk_32fc_s32f_power_32fc_a16_H +#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a_H +#define INCLUDED_volk_32fc_s32f_power_32fc_a_H #include #include #include //! raise a complex float to a real float power -static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a16(const lv_32fc_t exp, const float power){ +static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, const float power){ const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp)); const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2); return mag*lv_cmake(cosf(arg), sinf(arg)); @@ -26,7 +26,7 @@ static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a16(const lv_32fc_t exp, c \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ +static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -81,7 
+81,7 @@ static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const l #endif /* LV_HAVE_LIB_SIMDMATH */ for(;number < num_points; number++){ - *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); } } #endif /* LV_HAVE_SSE */ @@ -94,13 +94,13 @@ static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const l \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ +static inline void volk_32fc_s32f_power_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; unsigned int number = 0; for(number = 0; number < num_points; number++){ - *cPtr++ = __volk_s32fc_s32f_power_s32fc_a16((*aPtr++), power); + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); } } #endif /* LV_HAVE_GENERIC */ @@ -108,4 +108,4 @@ static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, con -#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a16_H */ +#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h index 03da069c2..8d1959dae 100644 --- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H -#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H +#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H +#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H #include #include @@ -19,7 +19,7 @@ \param normalizationFactor This 
value is divided against all the input values before the power is calculated \param num_points The number of fft data points */ -static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ const float* inputPtr = (const float*)complexFFTInput; float* destPtr = logPowerOutput; uint64_t number = 0; @@ -96,7 +96,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOut \param normalizationFactor This value is divided agains all the input values before the power is calculated \param num_points The number of fft data points */ -static inline void volk_32fc_s32f_power_spectrum_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_power_spectrum_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ // Calculate the Power of the complex point const float* inputPtr = (float*)complexFFTInput; float* realFFTDataPointsPtr = logPowerOutput; @@ -123,4 +123,4 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a16_generic(float* logPower -#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H */ +#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h index 5bcd7f7c4..fc635f171 100644 --- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H -#define 
INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H +#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H +#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H #include #include @@ -20,7 +20,7 @@ \param rbw The resolution bandwith of the fft spectrum \param num_points The number of fft data points */ -static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ const float* inputPtr = (const float*)complexFFTInput; float* destPtr = logPowerOutput; uint64_t number = 0; @@ -103,7 +103,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* \param rbw The resolution bandwith of the fft spectrum \param num_points The number of fft data points */ -static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ // Calculate the Power of the complex point const float* inputPtr = (float*)complexFFTInput; float* realFFTDataPointsPtr = logPowerOutput; @@ -131,4 +131,4 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_generic(floa -#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H */ +#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h 
index f221237ff..a6c21336d 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H -#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H +#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H +#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H #include #include @@ -9,7 +9,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { float * res = (float*) result; float * in = (float*) input; @@ -63,7 +63,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* r #if LV_HAVE_SSE && LV_HAVE_64 -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; @@ -204,7 +204,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* resul #endif #if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; @@ -342,4 +342,4 @@ 
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* re -#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H*/ +#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/ diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h index 9657c8f6b..022a0a614 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H -#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H +#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H +#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H #include #include @@ -10,7 +10,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { float * res = (float*) result; float * in = (float*) input; @@ -60,7 +60,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, con #if LV_HAVE_SSE && LV_HAVE_64 -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { asm @@ -195,7 +195,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, cons #if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { asm volatile ( @@ -321,7 +321,7 @@ 
static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, cons #include -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { lv_32fc_t dotProduct; @@ -378,8 +378,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const #include -static inline void volk_32fc_x2_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - volk_32fc_x2_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes); +static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + volk_32fc_x2_dot_prod_32fc_a_sse3(result, input, taps, num_bytes); // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now /* __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; @@ -466,4 +466,4 @@ static inline void volk_32fc_x2_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, cons #endif /*LV_HAVE_SSE4_1*/ -#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H*/ +#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h index 72010b855..18dd092e8 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a16_H -#define INCLUDED_volk_32fc_x2_multiply_32fc_a16_H +#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H +#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H #include #include @@ -15,7 +15,7 @@ \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be 
multiplied together and stored into cVector */ -static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ +static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int halfPoints = num_points / 2; @@ -61,7 +61,7 @@ static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ +static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; const lv_32fc_t* bPtr= bVector; @@ -81,9 +81,9 @@ static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, co \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); -static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); +static inline void volk_32fc_x2_multiply_32fc_a_orc(lv_32fc_t* cVector, const 
lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ @@ -91,4 +91,4 @@ static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const -#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a16_H */ +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h index 910f51679..be7a4ffe9 100644 --- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h +++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H -#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H +#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H +#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H #include #include @@ -10,7 +10,7 @@ #include #include -static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; @@ -106,7 +106,7 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; int i = 0; @@ -123,4 +123,4 @@ 
static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(flo #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H*/ +#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/ diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h index 551f3cb53..c21d00491 100644 --- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h +++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a16_H -#define INCLUDED_volk_32fc_x2_square_dist_32f_a16_H +#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H +#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H #include #include @@ -9,7 +9,7 @@ #include #include -static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { +static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; @@ -92,7 +92,7 @@ static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_ #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { +static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; int i = 0; @@ -109,4 +109,4 @@ static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32 #endif /*LV_HAVE_GENERIC*/ -#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a16_H*/ +#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a.h b/volk/include/volk/volk_32i_s32f_convert_32f_a.h index b744c7197..558142869 100644 --- 
a/volk/include/volk/volk_32i_s32f_convert_32f_a.h +++ b/volk/include/volk/volk_32i_s32f_convert_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32i_s32f_convert_32f_a16_H -#define INCLUDED_volk_32i_s32f_convert_32f_a16_H +#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H +#define INCLUDED_volk_32i_s32f_convert_32f_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -55,7 +55,7 @@ static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_32i_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ float* outputVectorPtr = outputVector; const int32_t* inputVectorPtr = inputVector; unsigned int number = 0; @@ -70,4 +70,4 @@ static inline void volk_32i_s32f_convert_32f_a16_generic(float* outputVector, co -#endif /* INCLUDED_volk_32i_s32f_convert_32f_a16_H */ +#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_32i_x2_and_32i_a.h b/volk/include/volk/volk_32i_x2_and_32i_a.h index 4d50efd32..dcd63d98e 100644 --- a/volk/include/volk/volk_32i_x2_and_32i_a.h +++ b/volk/include/volk/volk_32i_x2_and_32i_a.h @@ -1,5 +1,5 @@ -#ifndef 
INCLUDED_volk_32i_x2_and_32i_a16_H -#define INCLUDED_volk_32i_x2_and_32i_a16_H +#ifndef INCLUDED_volk_32i_x2_and_32i_a_H +#define INCLUDED_volk_32i_x2_and_32i_a_H #include #include @@ -13,7 +13,7 @@ \param bVector One of the vectors \param num_points The number of values in aVector and bVector to be anded together and stored into cVector */ -static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -51,7 +51,7 @@ static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* \param bVector One of the vectors \param num_points The number of values in aVector and bVector to be anded together and stored into cVector */ -static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_and_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ int32_t* cPtr = cVector; const int32_t* aPtr = aVector; const int32_t* bPtr= bVector; @@ -71,11 +71,11 @@ static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32 \param bVector One of the vectors \param num_points The number of values in aVector and bVector to be anded together and stored into cVector */ -extern void volk_32i_x2_and_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32i_x2_and_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32i_x2_and_32i_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, 
const int32_t* bVector, unsigned int num_points); +static inline void volk_32i_x2_and_32i_a_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32i_x2_and_32i_a16_H */ +#endif /* INCLUDED_volk_32i_x2_and_32i_a_H */ diff --git a/volk/include/volk/volk_32i_x2_or_32i_a.h b/volk/include/volk/volk_32i_x2_or_32i_a.h index 9edbdbafd..243e8178c 100644 --- a/volk/include/volk/volk_32i_x2_or_32i_a.h +++ b/volk/include/volk/volk_32i_x2_or_32i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32i_x2_or_32i_a16_H -#define INCLUDED_volk_32i_x2_or_32i_a16_H +#ifndef INCLUDED_volk_32i_x2_or_32i_a_H +#define INCLUDED_volk_32i_x2_or_32i_a_H #include #include @@ -13,7 +13,7 @@ \param bVector One of the vectors to be ored \param num_points The number of values in aVector and bVector to be ored together and stored into cVector */ -static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -51,7 +51,7 @@ static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* a \param bVector One of the vectors to be ored \param num_points The number of values in aVector and bVector to be ored together and stored into cVector */ -static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_or_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ int32_t* cPtr = cVector; const int32_t* aPtr = aVector; const int32_t* bPtr= bVector; @@ -71,11 +71,11 @@ static inline void 
volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_ \param bVector One of the vectors to be ored \param num_points The number of values in aVector and bVector to be ored together and stored into cVector */ -extern void volk_32i_x2_or_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32i_x2_or_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32i_x2_or_32i_a16_orc_impl(cVector, aVector, bVector, num_points); +extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32i_x2_or_32i_a_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ -#endif /* INCLUDED_volk_32i_x2_or_32i_a16_H */ +#endif /* INCLUDED_volk_32i_x2_or_32i_a_H */ diff --git a/volk/include/volk/volk_32u_byteswap_a.h b/volk/include/volk/volk_32u_byteswap_a.h index dc5cedab9..b88848096 100644 --- a/volk/include/volk/volk_32u_byteswap_a.h +++ b/volk/include/volk/volk_32u_byteswap_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_32u_byteswap_a16_H -#define INCLUDED_volk_32u_byteswap_a16_H +#ifndef INCLUDED_volk_32u_byteswap_a_H +#define INCLUDED_volk_32u_byteswap_a_H #include #include @@ -12,7 +12,7 @@ \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int num_points){ +static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points){ unsigned int number = 0; uint32_t* inputPtr = intsToSwap; @@ -57,7 +57,7 @@ static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int \param intsToSwap The vector of data to byte swap \param numDataPoints The number of 
data points */ -static inline void volk_32u_byteswap_a16_generic(uint32_t* intsToSwap, unsigned int num_points){ +static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points){ uint32_t* inputPtr = intsToSwap; unsigned int point; @@ -74,4 +74,4 @@ static inline void volk_32u_byteswap_a16_generic(uint32_t* intsToSwap, unsigned -#endif /* INCLUDED_volk_32u_byteswap_a16_H */ +#endif /* INCLUDED_volk_32u_byteswap_a_H */ diff --git a/volk/include/volk/volk_32u_popcnt_a.h b/volk/include/volk/volk_32u_popcnt_a.h index 0d8b48fd5..b72d605c6 100644 --- a/volk/include/volk/volk_32u_popcnt_a.h +++ b/volk/include/volk/volk_32u_popcnt_a.h @@ -7,7 +7,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t value) { +static inline void volk_32u_popcnt_a_generic(uint32_t* ret, const uint32_t value) { // This is faster than a lookup table uint32_t retVal = value; @@ -27,7 +27,7 @@ static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t val #include -static inline void volk_32u_popcnt_a16_sse4_2(uint32_t* ret, const uint32_t value) { +static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value) { *ret = _mm_popcnt_u32(value); } diff --git a/volk/include/volk/volk_64f_convert_32f_a.h b/volk/include/volk/volk_64f_convert_32f_a.h index cfcdbdc3a..2126e4f95 100644 --- a/volk/include/volk/volk_64f_convert_32f_a.h +++ b/volk/include/volk/volk_64f_convert_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_64f_convert_32f_a16_H -#define INCLUDED_volk_64f_convert_32f_a16_H +#ifndef INCLUDED_volk_64f_convert_32f_a_H +#define INCLUDED_volk_64f_convert_32f_a_H #include #include @@ -12,7 +12,7 @@ \param fVector The double vector values to be converted \param num_points The number of points in the two vectors to be converted */ -static inline void volk_64f_convert_32f_a16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ +static inline void 
volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -50,7 +50,7 @@ static inline void volk_64f_convert_32f_a16_sse2(float* outputVector, const doub \param fVector The double vector values to be converted \param num_points The number of points in the two vectors to be converted */ -static inline void volk_64f_convert_32f_a16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ +static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){ float* outputVectorPtr = outputVector; const double* inputVectorPtr = inputVector; unsigned int number = 0; @@ -64,4 +64,4 @@ static inline void volk_64f_convert_32f_a16_generic(float* outputVector, const d -#endif /* INCLUDED_volk_64f_convert_32f_a16_H */ +#endif /* INCLUDED_volk_64f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_64f_x2_max_64f_a.h b/volk/include/volk/volk_64f_x2_max_64f_a.h index 21f488bf7..61a704c52 100644 --- a/volk/include/volk/volk_64f_x2_max_64f_a.h +++ b/volk/include/volk/volk_64f_x2_max_64f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_64f_x2_max_64f_a16_H -#define INCLUDED_volk_64f_x2_max_64f_a16_H +#ifndef INCLUDED_volk_64f_x2_max_64f_a_H +#define INCLUDED_volk_64f_x2_max_64f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ +static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int halfPoints = num_points / 2; @@ -53,7 +53,7 @@ static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const 
double* a \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_64f_x2_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ +static inline void volk_64f_x2_max_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ double* cPtr = cVector; const double* aPtr = aVector; const double* bPtr= bVector; @@ -68,4 +68,4 @@ static inline void volk_64f_x2_max_64f_a16_generic(double* cVector, const double #endif /* LV_HAVE_GENERIC */ -#endif /* INCLUDED_volk_64f_x2_max_64f_a16_H */ +#endif /* INCLUDED_volk_64f_x2_max_64f_a_H */ diff --git a/volk/include/volk/volk_64f_x2_min_64f_a.h b/volk/include/volk/volk_64f_x2_min_64f_a.h index 8711a0eae..148b72c59 100644 --- a/volk/include/volk/volk_64f_x2_min_64f_a.h +++ b/volk/include/volk/volk_64f_x2_min_64f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_64f_x2_min_64f_a16_H -#define INCLUDED_volk_64f_x2_min_64f_a16_H +#ifndef INCLUDED_volk_64f_x2_min_64f_a_H +#define INCLUDED_volk_64f_x2_min_64f_a_H #include #include @@ -13,7 +13,7 @@ \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ +static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int halfPoints = num_points / 2; @@ -53,7 +53,7 @@ static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* a \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_64f_x2_min_64f_a16_generic(double* cVector, const 
double* aVector, const double* bVector, unsigned int num_points){ +static inline void volk_64f_x2_min_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ double* cPtr = cVector; const double* aPtr = aVector; const double* bPtr= bVector; @@ -68,4 +68,4 @@ static inline void volk_64f_x2_min_64f_a16_generic(double* cVector, const double #endif /* LV_HAVE_GENERIC */ -#endif /* INCLUDED_volk_64f_x2_min_64f_a16_H */ +#endif /* INCLUDED_volk_64f_x2_min_64f_a_H */ diff --git a/volk/include/volk/volk_64u_byteswap_a.h b/volk/include/volk/volk_64u_byteswap_a.h index b4bed8451..d4fc74a6e 100644 --- a/volk/include/volk/volk_64u_byteswap_a.h +++ b/volk/include/volk/volk_64u_byteswap_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_64u_byteswap_a16_H -#define INCLUDED_volk_64u_byteswap_a16_H +#ifndef INCLUDED_volk_64u_byteswap_a_H +#define INCLUDED_volk_64u_byteswap_a_H #include #include @@ -12,7 +12,7 @@ \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){ uint32_t* inputPtr = (uint32_t*)intsToSwap; __m128i input, byte1, byte2, byte3, byte4, output; __m128i byte2mask = _mm_set1_epi32(0x00FF0000); @@ -65,7 +65,7 @@ static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -static inline void volk_64u_byteswap_a16_generic(uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){ uint32_t* inputPtr = (uint32_t*)intsToSwap; unsigned int point; for(point = 0; point < num_points; point++){ @@ -85,4 +85,4 @@ static inline void volk_64u_byteswap_a16_generic(uint64_t* intsToSwap, unsigned -#endif /* 
INCLUDED_volk_64u_byteswap_a16_H */ +#endif /* INCLUDED_volk_64u_byteswap_a_H */ diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h index 8b92e91a1..bdaa98643 100644 --- a/volk/include/volk/volk_64u_popcnt_a.h +++ b/volk/include/volk/volk_64u_popcnt_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_64u_popcnt_a16_H -#define INCLUDED_volk_64u_popcnt_a16_H +#ifndef INCLUDED_volk_64u_popcnt_a_H +#define INCLUDED_volk_64u_popcnt_a_H #include #include @@ -8,7 +8,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t value) { +static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) { const uint32_t* valueVector = (const uint32_t*)&value; @@ -40,11 +40,11 @@ static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t val #include -static inline void volk_64u_popcnt_a16_sse4_2(uint64_t* ret, const uint64_t value) { +static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) { *ret = _mm_popcnt_u64(value); } #endif /*LV_HAVE_SSE4_2*/ -#endif /*INCLUDED_volk_64u_popcnt_a16_H*/ +#endif /*INCLUDED_volk_64u_popcnt_a_H*/ diff --git a/volk/include/volk/volk_8i_convert_16i_a.h b/volk/include/volk/volk_8i_convert_16i_a.h index 260ac40a1..9104f90cb 100644 --- a/volk/include/volk/volk_8i_convert_16i_a.h +++ b/volk/include/volk/volk_8i_convert_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8i_convert_16i_a16_H -#define INCLUDED_volk_8i_convert_16i_a16_H +#ifndef INCLUDED_volk_8i_convert_16i_a_H +#define INCLUDED_volk_8i_convert_16i_a_H #include #include @@ -13,7 +13,7 @@ \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ +static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ unsigned 
int number = 0; const unsigned int sixteenthPoints = num_points / 16; @@ -54,7 +54,7 @@ static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const i \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ +static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ int16_t* outputVectorPtr = outputVector; const int8_t* inputVectorPtr = inputVector; unsigned int number = 0; @@ -72,9 +72,9 @@ static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -extern void volk_8i_convert_16i_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); -static inline void volk_8i_convert_16i_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - volk_8i_convert_16i_a16_orc_impl(outputVector, inputVector, num_points); +extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); +static inline void volk_8i_convert_16i_a_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a.h b/volk/include/volk/volk_8i_s32f_convert_32f_a.h index 9991b150e..7f2623ac6 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_a.h +++ b/volk/include/volk/volk_8i_s32f_convert_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8i_s32f_convert_32f_a16_H -#define INCLUDED_volk_8i_s32f_convert_32f_a16_H +#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H +#define INCLUDED_volk_8i_s32f_convert_32f_a_H #include #include @@ 
-14,7 +14,7 @@ \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; @@ -74,7 +74,7 @@ static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, cons \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ float* outputVectorPtr = outputVector; const int8_t* inputVectorPtr = inputVector; unsigned int number = 0; @@ -94,10 +94,10 @@ static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, con \param scalar The value divided against each point in the output buffer \param num_points The number of data values to be converted */ -extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); -static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ +extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); +static inline void volk_8i_s32f_convert_32f_a_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ float 
invscalar = 1.0 / scalar; - volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, invscalar, num_points); + volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h index 249acab49..8f13da32f 100644 --- a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h +++ b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H -#define INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H +#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H +#define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H #include #include @@ -13,7 +13,7 @@ \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -59,7 +59,7 @@ static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ const int8_t* complexVectorPtr = (const int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; int16_t* qBufferPtr = qBuffer; @@ -74,4 +74,4 @@ static inline void 
volk_8ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, in -#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H */ +#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h index 7b64b37c5..d26b3d0d0 100644 --- a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h +++ b/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a16_H -#define INCLUDED_volk_8ic_deinterleave_real_16i_a16_H +#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a_H +#define INCLUDED_volk_8ic_deinterleave_real_16i_a_H #include #include @@ -12,7 +12,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -49,7 +49,7 @@ static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, c \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (const int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; @@ -63,4 +63,4 @@ static inline void volk_8ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, -#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a16_H */ +#endif /* 
INCLUDED_volk_8ic_deinterleave_real_16i_a_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h index a1abad487..21efed83e 100644 --- a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h +++ b/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h @@ -12,7 +12,7 @@ \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int8_t* iBufferPtr = iBuffer; @@ -50,7 +50,7 @@ static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, cons \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int8_t* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h index 7d778796e..b723c6f8b 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H -#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H +#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H +#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H #include #include @@ -15,7 +15,7 @@ \param 
scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -85,7 +85,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -146,7 +146,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, flo \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ const int8_t* complexVectorPtr = (const int8_t*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -162,4 +162,4 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_generic(float* 
iBuffer, -#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H */ +#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h index a2e0cd8de..74073f5a6 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H -#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H +#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H +#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H #include #include @@ -14,7 +14,7 @@ \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; unsigned int number = 0; @@ -71,7 +71,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ float* iBufferPtr = iBuffer; unsigned int number = 0; @@ -116,7 +116,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, c \param scalar The scaling value being multiplied 
against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (const int8_t*)complexVector; float* iBufferPtr = iBuffer; @@ -131,4 +131,4 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffe -#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H */ +#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */ diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h index 7307ae484..0bb76f1d1 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H -#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H +#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H +#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H #include #include @@ -14,7 +14,7 @@ \param bVector The complex vector which will be converted to complex conjugate and multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -75,7 +75,7 @@ static inline void 
volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVe \param bVector The complex vector which will be converted to complex conjugate and multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +static inline void volk_8ic_x2_multiply_conjugate_16ic_a_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ unsigned int number = 0; int16_t* c16Ptr = (int16_t*)cVector; int8_t* a8Ptr = (int8_t*)aVector; @@ -98,4 +98,4 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cV -#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H */ +#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H */ diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h index adc7c0599..3e05608a4 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h @@ -1,5 +1,5 @@ -#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H -#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H +#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H +#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H #include #include @@ -14,7 +14,7 @@ \param bVector The complex vector which will be converted to complex conjugate and multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ +static inline void 
volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -95,7 +95,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t \param bVector The complex vector which will be converted to complex conjugate and multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ unsigned int number = 0; float* cPtr = (float*)cVector; const float invScalar = 1.0 / scalar; @@ -119,4 +119,4 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_ -#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H */ +#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */ -- cgit From c80e7b00de836a388013a002008c7256481414cb Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 18 May 2011 13:39:57 -0700 Subject: Volk: renamed everything else to _a instead of _a16, makefiles included --- volk/include/volk/Makefile.am | 152 +++++++++++++++++++++--------------------- 1 file changed, 76 insertions(+), 76 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 5f9e134bc..b7da9b37c 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -33,91 +33,91 @@ volkinclude_HEADERS = \ $(top_gendir)/include/volk/volk_typedefs.h \ $(top_gendir)/include/volk/volk.h \ $(top_gendir)/include/volk/volk_cpu.h \ - 
volk_16i_x5_add_quad_16i_x4_a16.h \ - volk_16i_branch_4_state_8_a16.h \ - volk_16ic_deinterleave_16i_x2_a16.h \ - volk_16ic_s32f_deinterleave_32f_x2_a16.h \ - volk_16ic_deinterleave_real_16i_a16.h \ - volk_16ic_s32f_deinterleave_real_32f_a16.h \ - volk_16ic_deinterleave_real_8i_a16.h \ - volk_16ic_magnitude_16i_a16.h \ - volk_16ic_s32f_magnitude_32f_a16.h \ - volk_16i_s32f_convert_32f_a16.h \ + volk_16i_x5_add_quad_16i_x4_a.h \ + volk_16i_branch_4_state_8_a.h \ + volk_16ic_deinterleave_16i_x2_a.h \ + volk_16ic_s32f_deinterleave_32f_x2_a.h \ + volk_16ic_deinterleave_real_16i_a.h \ + volk_16ic_s32f_deinterleave_real_32f_a.h \ + volk_16ic_deinterleave_real_8i_a.h \ + volk_16ic_magnitude_16i_a.h \ + volk_16ic_s32f_magnitude_32f_a.h \ + volk_16i_s32f_convert_32f_a.h \ volk_16i_s32f_convert_32f_u.h \ - volk_16i_convert_8i_a16.h \ + volk_16i_convert_8i_a.h \ volk_16i_convert_8i_u.h \ - volk_16i_max_star_16i_a16.h \ - volk_16i_max_star_horizontal_16i_a16.h \ - volk_16i_permute_and_scalar_add_a16.h \ - volk_16i_x4_quad_max_star_16i_a16.h \ - volk_16u_byteswap_a16.h \ - volk_32f_accumulator_s32f_a16.h \ - volk_32f_x2_add_32f_a16.h \ - volk_32fc_32f_multiply_32fc_a16.h \ - volk_32fc_s32f_power_32fc_a16.h \ - volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h \ - volk_32fc_s32f_atan2_32f_a16.h \ - volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \ + volk_16i_max_star_16i_a.h \ + volk_16i_max_star_horizontal_16i_a.h \ + volk_16i_permute_and_scalar_add_a.h \ + volk_16i_x4_quad_max_star_16i_a.h \ + volk_16u_byteswap_a.h \ + volk_32f_accumulator_s32f_a.h \ + volk_32f_x2_add_32f_a.h \ + volk_32fc_32f_multiply_32fc_a.h \ + volk_32fc_s32f_power_32fc_a.h \ + volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \ + volk_32fc_s32f_atan2_32f_a.h \ + volk_32fc_x2_conjugate_dot_prod_32fc_a.h \ volk_32fc_x2_conjugate_dot_prod_32fc_u.h \ - volk_32fc_deinterleave_32f_x2_a16.h \ - volk_32fc_deinterleave_64f_x2_a16.h \ - volk_32fc_s32f_deinterleave_real_16i_a16.h \ - volk_32fc_deinterleave_real_32f_a16.h \ 
- volk_32fc_deinterleave_real_64f_a16.h \ - volk_32fc_x2_dot_prod_32fc_a16.h \ - volk_32fc_index_max_16u_a16.h \ - volk_32fc_s32f_magnitude_16i_a16.h \ - volk_32fc_magnitude_32f_a16.h \ - volk_32fc_x2_multiply_32fc_a16.h \ - volk_32f_s32f_convert_16i_a16.h \ + volk_32fc_deinterleave_32f_x2_a.h \ + volk_32fc_deinterleave_64f_x2_a.h \ + volk_32fc_s32f_deinterleave_real_16i_a.h \ + volk_32fc_deinterleave_real_32f_a.h \ + volk_32fc_deinterleave_real_64f_a.h \ + volk_32fc_x2_dot_prod_32fc_a.h \ + volk_32fc_index_max_16u_a.h \ + volk_32fc_s32f_magnitude_16i_a.h \ + volk_32fc_magnitude_32f_a.h \ + volk_32fc_x2_multiply_32fc_a.h \ + volk_32f_s32f_convert_16i_a.h \ volk_32f_s32f_convert_16i_u.h \ - volk_32f_s32f_convert_32i_a16.h \ + volk_32f_s32f_convert_32i_a.h \ volk_32f_s32f_convert_32i_u.h \ - volk_32f_convert_64f_a16.h \ + volk_32f_convert_64f_a.h \ volk_32f_convert_64f_u.h \ - volk_32f_s32f_convert_8i_a16.h \ + volk_32f_s32f_convert_8i_a.h \ volk_32f_s32f_convert_8i_u.h \ - volk_32fc_s32f_x2_power_spectral_density_32f_a16.h \ - volk_32fc_s32f_power_spectrum_32f_a16.h \ - volk_32fc_x2_square_dist_32f_a16.h \ - volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h \ - volk_32f_x2_divide_32f_a16.h \ - volk_32f_x2_dot_prod_32f_a16.h \ + volk_32fc_s32f_x2_power_spectral_density_32f_a.h \ + volk_32fc_s32f_power_spectrum_32f_a.h \ + volk_32fc_x2_square_dist_32f_a.h \ + volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h \ + volk_32f_x2_divide_32f_a.h \ + volk_32f_x2_dot_prod_32f_a.h \ volk_32f_x2_dot_prod_32f_u.h \ - volk_32f_s32f_32f_fm_detect_32f_a16.h \ - volk_32f_index_max_16u_a16.h \ - volk_32f_x2_s32f_interleave_16ic_a16.h \ - volk_32f_x2_interleave_32fc_a16.h \ - volk_32f_x2_max_32f_a16.h \ - volk_32f_x2_min_32f_a16.h \ - volk_32f_x2_multiply_32f_a16.h \ - volk_32f_s32f_normalize_a16.h \ - volk_32f_s32f_power_32f_a16.h \ - volk_32f_sqrt_32f_a16.h \ - volk_32f_s32f_stddev_32f_a16.h \ - volk_32f_stddev_and_mean_32f_x2_a16.h \ - volk_32f_x2_subtract_32f_a16.h \ - 
volk_32f_x3_sum_of_poly_32f_a16.h \ - volk_32i_x2_and_32i_a16.h \ - volk_32i_s32f_convert_32f_a16.h \ + volk_32f_s32f_32f_fm_detect_32f_a.h \ + volk_32f_index_max_16u_a.h \ + volk_32f_x2_s32f_interleave_16ic_a.h \ + volk_32f_x2_interleave_32fc_a.h \ + volk_32f_x2_max_32f_a.h \ + volk_32f_x2_min_32f_a.h \ + volk_32f_x2_multiply_32f_a.h \ + volk_32f_s32f_normalize_a.h \ + volk_32f_s32f_power_32f_a.h \ + volk_32f_sqrt_32f_a.h \ + volk_32f_s32f_stddev_32f_a.h \ + volk_32f_stddev_and_mean_32f_x2_a.h \ + volk_32f_x2_subtract_32f_a.h \ + volk_32f_x3_sum_of_poly_32f_a.h \ + volk_32i_x2_and_32i_a.h \ + volk_32i_s32f_convert_32f_a.h \ volk_32i_s32f_convert_32f_u.h \ - volk_32i_x2_or_32i_a16.h \ - volk_32u_byteswap_a16.h \ - volk_32u_popcnt_a16.h \ - volk_64f_convert_32f_a16.h \ + volk_32i_x2_or_32i_a.h \ + volk_32u_byteswap_a.h \ + volk_32u_popcnt_a.h \ + volk_64f_convert_32f_a.h \ volk_64f_convert_32f_u.h \ - volk_64f_x2_max_64f_a16.h \ - volk_64f_x2_min_64f_a16.h \ - volk_64u_byteswap_a16.h \ - volk_64u_popcnt_a16.h \ - volk_8ic_deinterleave_16i_x2_a16.h \ - volk_8ic_s32f_deinterleave_32f_x2_a16.h \ - volk_8ic_deinterleave_real_16i_a16.h \ - volk_8ic_s32f_deinterleave_real_32f_a16.h \ - volk_8ic_deinterleave_real_8i_a16.h \ - volk_8ic_x2_multiply_conjugate_16ic_a16.h \ - volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h \ - volk_8i_convert_16i_a16.h \ + volk_64f_x2_max_64f_a.h \ + volk_64f_x2_min_64f_a.h \ + volk_64u_byteswap_a.h \ + volk_64u_popcnt_a.h \ + volk_8ic_deinterleave_16i_x2_a.h \ + volk_8ic_s32f_deinterleave_32f_x2_a.h \ + volk_8ic_deinterleave_real_16i_a.h \ + volk_8ic_s32f_deinterleave_real_32f_a.h \ + volk_8ic_deinterleave_real_8i_a.h \ + volk_8ic_x2_multiply_conjugate_16ic_a.h \ + volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h \ + volk_8i_convert_16i_a.h \ volk_8i_convert_16i_u.h \ - volk_8i_s32f_convert_32f_a16.h \ + volk_8i_s32f_convert_32f_a.h \ volk_8i_s32f_convert_32f_u.h -- cgit