From bef3db60e73953f2d2ecdc6a86a81e11df3b103d Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Mon, 13 Dec 2010 19:18:45 -0800
Subject: volk: committed some stuff i neglected
---
volk/include/volk/Makefile.am | 2 +-
volk/include/volk/archs.xml | 4 ++++
volk/include/volk/make_set_simd.py | 3 +++
volk/include/volk/volk_32f_sqrt_aligned16.h | 13 +++++++++++++
4 files changed, 21 insertions(+), 1 deletion(-)
(limited to 'volk/include')
diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index 04a43bd34..99276ab87 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -179,4 +179,4 @@ distclean-local:
rm -f Makefile.in
rm -f volk_environment_init.h
rm -f volk_mktables
- rm -f $(BUILT_SOURCES)
\ No newline at end of file
+ rm -f $(BUILT_SOURCES)
diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml
index b7c98500f..a828e5ad0 100644
--- a/volk/include/volk/archs.xml
+++ b/volk/include/volk/archs.xml
@@ -5,6 +5,10 @@
none
+
+ lorc-0.4
+
+
maltivec
diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py
index 842366b18..e568aebfa 100644
--- a/volk/include/volk/make_set_simd.py
+++ b/volk/include/volk/make_set_simd.py
@@ -111,6 +111,9 @@ def make_set_simd(dom) :
tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n";
tempstring = tempstring + " ADDONS=\"\"\n";
tempstring = tempstring + " BUILT_ARCHS=\"generic\"\n";
+ tempstring = tempstring + " if test $HAVE_ORC = yes; then\n";
+ tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} orc\"\n";
+ tempstring = tempstring + " fi\n";
tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n";
tempstring = tempstring + " OVERRULE_FLAG=\"no\"\n";
tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n";
diff --git a/volk/include/volk/volk_32f_sqrt_aligned16.h b/volk/include/volk/volk_32f_sqrt_aligned16.h
index 0b2eaf251..f6996ad5f 100644
--- a/volk/include/volk/volk_32f_sqrt_aligned16.h
+++ b/volk/include/volk/volk_32f_sqrt_aligned16.h
@@ -58,6 +58,19 @@ static inline void volk_32f_sqrt_aligned16_generic(float* cVector, const float*
}
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_ORC
+extern void volk_32f_sqrt_aligned16_orc_impl(float *, const float*, unsigned int);
+/*!
+ \brief Sqrts the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be sqrted
+ \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
+*/
+static inline void volk_32f_sqrt_aligned16_orc(float* cVector, const float* aVector, unsigned int num_points){
+ volk_32f_sqrt_aligned16_orc_impl(cVector, aVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
--
cgit
From 05f4bced29987a0a573d1fc5b214f3fa01dc84bd Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Tue, 14 Dec 2010 13:36:55 -0800
Subject: Volk: More autotools stuff for Orc. Should build OK with or without
Orc now.
---
volk/include/volk/make_set_simd.py | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
(limited to 'volk/include')
diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py
index e568aebfa..78a00476d 100644
--- a/volk/include/volk/make_set_simd.py
+++ b/volk/include/volk/make_set_simd.py
@@ -183,8 +183,14 @@ def make_set_simd(dom) :
tempstring = tempstring + " indCXX=no\n"
tempstring = tempstring + " indLV_ARCH=no\n"
elif atype == "all":
- tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
- tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
+ if arch == "orc":
+ tempstring = tempstring + " if test $HAVE_ORC = yes; then\n";
+ tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+ tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
+ tempstring = tempstring + " fi\n";
+ else:
+ tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+ tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
tempstring = tempstring + " ;;\n"
tempstring = tempstring + " (powerpc)\n"
--
cgit
From be78b530701850b964118fd0f63ba2bbdca9759d Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Tue, 14 Dec 2010 14:14:00 -0800
Subject: pre-patch...
---
volk/include/volk/volk_32s_and_aligned16.h | 14 +++++++++++++-
volk/include/volk/volk_8s_convert_16s_aligned16.h | 12 ++++++++++++
volk/include/volk/volk_8s_convert_32f_aligned16.h | 13 +++++++++++++
3 files changed, 38 insertions(+), 1 deletion(-)
(limited to 'volk/include')
diff --git a/volk/include/volk/volk_32s_and_aligned16.h b/volk/include/volk/volk_32s_and_aligned16.h
index e9f1e3a43..16c63fd48 100644
--- a/volk/include/volk/volk_32s_and_aligned16.h
+++ b/volk/include/volk/volk_32s_and_aligned16.h
@@ -63,7 +63,19 @@ static inline void volk_32s_and_aligned16_generic(int32_t* cVector, const int32_
}
#endif /* LV_HAVE_GENERIC */
-
+#if LV_HAVE_ORC
+/*!
+ \brief Ands the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors
+ \param bVector One of the vectors
+ \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+extern void volk_32s_and_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+static inline void volk_32s_and_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+ volk_32s_and_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_VOLK_32s_AND_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8s_convert_16s_aligned16.h b/volk/include/volk/volk_8s_convert_16s_aligned16.h
index 0efe3c6a1..c52c64eae 100644
--- a/volk/include/volk/volk_8s_convert_16s_aligned16.h
+++ b/volk/include/volk/volk_8s_convert_16s_aligned16.h
@@ -65,6 +65,18 @@ static inline void volk_8s_convert_16s_aligned16_generic(int16_t* outputVector,
}
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_ORC
+ /*!
+ \brief Converts the input 8 bit integer data into 16 bit integer data
+ \param inputVector The 8 bit input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param num_points The number of data values to be converted
+ */
+extern void volk_8s_convert_16s_aligned16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points);
+static inline void volk_8s_convert_16s_aligned16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+ volk_8s_convert_16s_aligned16_orc_impl(outputVector, inputVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_8s_convert_32f_aligned16.h b/volk/include/volk/volk_8s_convert_32f_aligned16.h
index 54b66ef8f..700a0fa42 100644
--- a/volk/include/volk/volk_8s_convert_32f_aligned16.h
+++ b/volk/include/volk/volk_8s_convert_32f_aligned16.h
@@ -86,6 +86,19 @@ static inline void volk_8s_convert_32f_aligned16_generic(float* outputVector, co
}
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_ORC
+ /*!
+ \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+ \param inputVector The 8 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ */
+extern void volk_8s_convert_32f_aligned16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
+static inline void volk_8s_convert_32f_aligned16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+ volk_8s_convert_32f_aligned16_orc_impl(outputVector, inputVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
--
cgit
From 2e9a7d350713b4e1b21458db8f3fce8a557858ae Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Tue, 14 Dec 2010 17:13:40 -0800
Subject: Volk: Added QA tests for all the Orc stuff. Added a 16u_byteswap but
it's broken right now.
---
volk/include/volk/volk_16u_byteswap_aligned16.h | 12 ++++++++++++
volk/include/volk/volk_32f_add_aligned16.h | 14 +++++++++++++-
2 files changed, 25 insertions(+), 1 deletion(-)
(limited to 'volk/include')
diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h
index 698e958e4..8e628ab17 100644
--- a/volk/include/volk/volk_16u_byteswap_aligned16.h
+++ b/volk/include/volk/volk_16u_byteswap_aligned16.h
@@ -61,5 +61,17 @@ static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, uns
}
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_ORC
+/*!
+ \brief Byteswaps (in-place) an aligned vector of int16_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param numDataPoints The number of data points
+*/
+extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, uint16_t* intsToSwapAgain, unsigned int num_points);
+static inline void volk_16u_byteswap_aligned16_orc(uint16_t* intsToSwap, unsigned int num_points){
+ volk_16u_byteswap_aligned16_orc_impl(intsToSwap, intsToSwap, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
#endif /* INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_add_aligned16.h b/volk/include/volk/volk_32f_add_aligned16.h
index 721c60fd6..e7d8de265 100644
--- a/volk/include/volk/volk_32f_add_aligned16.h
+++ b/volk/include/volk/volk_32f_add_aligned16.h
@@ -63,7 +63,19 @@ static inline void volk_32f_add_aligned16_generic(float* cVector, const float* a
}
#endif /* LV_HAVE_GENERIC */
-
+#if LV_HAVE_ORC
+/*!
+ \brief Adds the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+extern void volk_32f_add_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_add_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_add_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_VOLK_32f_ADD_ALIGNED16_H */
--
cgit
From 26415a1445490cc3230c5d793a41703931ae9d01 Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Tue, 14 Dec 2010 17:23:20 -0800
Subject: Volk: Nick's commits to make adding Orc a little more structurally
sound
---
volk/include/volk/archs.xml | 2 ++
volk/include/volk/make_set_simd.py | 22 ++++++++++------------
2 files changed, 12 insertions(+), 12 deletions(-)
(limited to 'volk/include')
diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml
index a828e5ad0..b61bfca09 100644
--- a/volk/include/volk/archs.xml
+++ b/volk/include/volk/archs.xml
@@ -7,6 +7,8 @@
lorc-0.4
+ HAVE_ORC
+ no
diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py
index 78a00476d..d7109cfcb 100644
--- a/volk/include/volk/make_set_simd.py
+++ b/volk/include/volk/make_set_simd.py
@@ -110,10 +110,7 @@ def make_set_simd(dom) :
arch = str(domarch.attributes["name"].value);
tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n";
tempstring = tempstring + " ADDONS=\"\"\n";
- tempstring = tempstring + " BUILT_ARCHS=\"generic\"\n";
- tempstring = tempstring + " if test $HAVE_ORC = yes; then\n";
- tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} orc\"\n";
- tempstring = tempstring + " fi\n";
+ tempstring = tempstring + " BUILT_ARCHS=\"\"\n";
tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n";
tempstring = tempstring + " OVERRULE_FLAG=\"no\"\n";
tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n";
@@ -183,14 +180,11 @@ def make_set_simd(dom) :
tempstring = tempstring + " indCXX=no\n"
tempstring = tempstring + " indLV_ARCH=no\n"
elif atype == "all":
- if arch == "orc":
- tempstring = tempstring + " if test $HAVE_ORC = yes; then\n";
- tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
- tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
- tempstring = tempstring + " fi\n";
- else:
- tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
- tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
+ tempstring = tempstring + " if test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n"
+ tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+ tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
+ tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n";
+ tempstring = tempstring + " fi\n"
tempstring = tempstring + " ;;\n"
tempstring = tempstring + " (powerpc)\n"
@@ -234,11 +228,15 @@ def make_set_simd(dom) :
tempstring = tempstring + " indCXX=no\n"
tempstring = tempstring + " indLV_ARCH=no\n"
elif atype == "all":
+ tempstring = tempstring + " if test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n"
tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
+ tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n";
+ tempstring = tempstring + " fi\n"
tempstring = tempstring + " ;;\n"
tempstring = tempstring + " esac\n"
tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n"
+ tempstring = tempstring + " AM_CONDITIONAL(LV_HAVE_ORC, [test \"$LV_HAVE_ORC\" = \"yes\"])\n";
tempstring = tempstring + "])\n"
return tempstring;
--
cgit
From 21426265324c883c91eeaaf75a81f2ccdc6e249d Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Tue, 14 Dec 2010 21:12:49 -0800
Subject: Volk: Build fixes to work with/without Orc.
---
volk/include/volk/archs.xml | 2 +-
volk/include/volk/make_set_simd.py | 51 +++++++++++++++++++++++++++++++++++---
2 files changed, 48 insertions(+), 5 deletions(-)
(limited to 'volk/include')
diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml
index b61bfca09..a19a5add9 100644
--- a/volk/include/volk/archs.xml
+++ b/volk/include/volk/archs.xml
@@ -7,7 +7,7 @@
lorc-0.4
- HAVE_ORC
+ LV_HAVE_ORC
no
diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py
index d7109cfcb..f2b7c0656 100644
--- a/volk/include/volk/make_set_simd.py
+++ b/volk/include/volk/make_set_simd.py
@@ -180,11 +180,22 @@ def make_set_simd(dom) :
tempstring = tempstring + " indCXX=no\n"
tempstring = tempstring + " indLV_ARCH=no\n"
elif atype == "all":
- tempstring = tempstring + " if test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n"
+ tempstring = tempstring + " for i in $cf_with_lv_arch\n"
+ tempstring = tempstring + " do\n"
+ tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n";
+ tempstring = tempstring + " indLV_ARCH=yes\n"
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " done\n"
+ tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n"
+ tempstring = tempstring + " indLV_ARCH=no\n"
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n"
tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n";
tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " indLV_ARCH=no\n"
+
tempstring = tempstring + " ;;\n"
tempstring = tempstring + " (powerpc)\n"
@@ -228,11 +239,44 @@ def make_set_simd(dom) :
tempstring = tempstring + " indCXX=no\n"
tempstring = tempstring + " indLV_ARCH=no\n"
elif atype == "all":
- tempstring = tempstring + " if test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n"
+ tempstring = tempstring + " for i in $cf_with_lv_arch\n"
+ tempstring = tempstring + " do\n"
+ tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n";
+ tempstring = tempstring + " indLV_ARCH=yes\n"
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " done\n"
+ tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n"
+ tempstring = tempstring + " indLV_ARCH=no\n"
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n"
+ tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+ tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
+ tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n";
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " indLV_ARCH=no\n"
+ tempstring = tempstring + " ;;\n"
+ tempstring = tempstring + " (*)\n"
+ for domarch in dom:
+ arch = str(domarch.attributes["name"].value);
+ atype = str(domarch.attributes["type"].value);
+ flag = domarch.getElementsByTagName("flag");
+ flag = str(flag[0].firstChild.data);
+ if atype == "all":
+ tempstring = tempstring + " for i in $cf_with_lv_arch\n"
+ tempstring = tempstring + " do\n"
+ tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n";
+ tempstring = tempstring + " indLV_ARCH=yes\n"
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " done\n"
+ tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n"
+ tempstring = tempstring + " indLV_ARCH=no\n"
+ tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n"
tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n";
tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n";
tempstring = tempstring + " fi\n"
+ tempstring = tempstring + " indLV_ARCH=no\n"
tempstring = tempstring + " ;;\n"
tempstring = tempstring + " esac\n"
tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n"
@@ -240,6 +284,5 @@ def make_set_simd(dom) :
tempstring = tempstring + "])\n"
return tempstring;
-
-
+
--
cgit
From f9ee6a55cb397f9302769a25a8c959fa162354f0 Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Tue, 14 Dec 2010 22:58:33 -0800
Subject: Volk: Some new basic Orc implementations with QA code
---
volk/include/volk/volk_16u_byteswap_aligned16.h | 4 ++--
volk/include/volk/volk_32f_divide_aligned16.h | 13 +++++++++++++
volk/include/volk/volk_32f_multiply_aligned16.h | 14 +++++++++++++-
volk/include/volk/volk_32f_subtract_aligned16.h | 14 ++++++++++++++
4 files changed, 42 insertions(+), 3 deletions(-)
(limited to 'volk/include')
diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h
index 8e628ab17..9d19d1a45 100644
--- a/volk/include/volk/volk_16u_byteswap_aligned16.h
+++ b/volk/include/volk/volk_16u_byteswap_aligned16.h
@@ -67,9 +67,9 @@ static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, uns
\param intsToSwap The vector of data to byte swap
\param numDataPoints The number of data points
*/
-extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, uint16_t* intsToSwapAgain, unsigned int num_points);
+extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
static inline void volk_16u_byteswap_aligned16_orc(uint16_t* intsToSwap, unsigned int num_points){
- volk_16u_byteswap_aligned16_orc_impl(intsToSwap, intsToSwap, num_points);
+ volk_16u_byteswap_aligned16_orc_impl(intsToSwap, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_divide_aligned16.h b/volk/include/volk/volk_32f_divide_aligned16.h
index c00700cd8..c595b5e92 100644
--- a/volk/include/volk/volk_32f_divide_aligned16.h
+++ b/volk/include/volk/volk_32f_divide_aligned16.h
@@ -63,6 +63,19 @@ static inline void volk_32f_divide_aligned16_generic(float* cVector, const float
}
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_ORC
+/*!
+ \brief Divides the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be divideed
+ \param bVector The divisor vector
+ \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+extern void volk_32f_divide_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_divide_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_divide_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_multiply_aligned16.h b/volk/include/volk/volk_32f_multiply_aligned16.h
index b557580ab..87ae7bcf8 100644
--- a/volk/include/volk/volk_32f_multiply_aligned16.h
+++ b/volk/include/volk/volk_32f_multiply_aligned16.h
@@ -63,7 +63,19 @@ static inline void volk_32f_multiply_aligned16_generic(float* cVector, const flo
}
#endif /* LV_HAVE_GENERIC */
-
+#if LV_HAVE_ORC
+/*!
+ \brief Multiplys the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+extern void volk_32f_multiply_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_multiply_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_subtract_aligned16.h b/volk/include/volk/volk_32f_subtract_aligned16.h
index ac3f5e5d1..e15242901 100644
--- a/volk/include/volk/volk_32f_subtract_aligned16.h
+++ b/volk/include/volk/volk_32f_subtract_aligned16.h
@@ -63,5 +63,19 @@ static inline void volk_32f_subtract_aligned16_generic(float* cVector, const flo
}
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_ORC
+/*!
+ \brief Subtracts bVector form aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The initial vector
+ \param bVector The vector to be subtracted
+ \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+extern void volk_32f_subtract_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_subtract_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_subtract_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
#endif /* INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H */
--
cgit
From 15ad4b5398e474bfb52fdb7e826b69f3e398c0b0 Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Wed, 15 Dec 2010 16:27:42 -0800
Subject: Volk: A bunch of new ORC routines plus tests. Also fixed a typo in
the generic version of 16sc_magnitude_16s_a16.
---
volk/include/volk/volk_16sc_magnitude_16s_aligned16.h | 15 +++++++++++++--
volk/include/volk/volk_16sc_magnitude_32f_aligned16.h | 14 +++++++++++++-
volk/include/volk/volk_32fc_magnitude_16s_aligned16.h | 14 +++++++++++++-
volk/include/volk/volk_32fc_magnitude_32f_aligned16.h | 13 ++++++++++++-
volk/include/volk/volk_32s_or_aligned16.h | 14 +++++++++++++-
5 files changed, 64 insertions(+), 6 deletions(-)
(limited to 'volk/include')
diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
index 1482ab82e..9f3222aa6 100644
--- a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
+++ b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
@@ -164,7 +164,7 @@ static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeV
const int16_t* complexVectorPtr = (const int16_t*)complexVector;
int16_t* magnitudeVectorPtr = magnitudeVector;
unsigned int number = 0;
- const float scalar = 32786.0;
+ const float scalar = 32768.0;
for(number = 0; number < num_points; number++){
float real = ((float)(*complexVectorPtr++)) / scalar;
float imag = ((float)(*complexVectorPtr++)) / scalar;
@@ -173,7 +173,18 @@ static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeV
}
#endif /* LV_HAVE_GENERIC */
-
+#if LV_HAVE_ORC
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16sc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+ volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
index 9c2a48835..e063ae432 100644
--- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
+++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
@@ -161,7 +161,19 @@ static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVec
}
#endif /* LV_HAVE_GENERIC */
-
+#if LV_HAVE_ORC
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_16sc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16sc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ volk_16sc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
index 4e64d8c22..4e590e120 100644
--- a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
+++ b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
@@ -140,7 +140,19 @@ static inline void volk_32fc_magnitude_16s_aligned16_generic(int16_t* magnitudeV
}
#endif /* LV_HAVE_GENERIC */
-
+#if LV_HAVE_ORC
+/*!
+ \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param scalar The scale value multiplied to the magnitude of each complex vector
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_32fc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_32fc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+ volk_32fc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
index 7a8fd1ef9..3ea62da6a 100644
--- a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
+++ b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
@@ -115,7 +115,18 @@ static inline void volk_32fc_magnitude_32f_aligned16_generic(float* magnitudeVec
}
#endif /* LV_HAVE_GENERIC */
-
+#if LV_HAVE_ORC
+ /*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+ */
+extern void volk_32fc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points);
+static inline void volk_32fc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ volk_32fc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32s_or_aligned16.h b/volk/include/volk/volk_32s_or_aligned16.h
index f4c427c4d..64748d535 100644
--- a/volk/include/volk/volk_32s_or_aligned16.h
+++ b/volk/include/volk/volk_32s_or_aligned16.h
@@ -63,7 +63,19 @@ static inline void volk_32s_or_aligned16_generic(int32_t* cVector, const int32_t
}
#endif /* LV_HAVE_GENERIC */
-
+#if LV_HAVE_ORC
+/*!
+ \brief Ors the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be ored
+ \param bVector One of the vectors to be ored
+ \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+extern void volk_32s_or_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+static inline void volk_32s_or_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+ volk_32s_or_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_VOLK_32s_OR_ALIGNED16_H */
--
cgit
From c6fff77de9b686761f93f0e1de237f8543f5e919 Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Fri, 17 Dec 2010 11:14:41 -0800
Subject: Volk: A bunch of new Orc routines plus a couple of build changes.
32fc_magnitude_16s fails test_all right now.
---
volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h | 14 +++++++++++++-
volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h | 15 ++++++++++++++-
.../volk/volk_16sc_deinterleave_real_8s_aligned16.h | 13 ++++++++++++-
volk/include/volk/volk_16sc_magnitude_16s_aligned16.h | 6 +++---
volk/include/volk/volk_16sc_magnitude_32f_aligned16.h | 2 +-
volk/include/volk/volk_32f_max_aligned16.h | 14 ++++++++++++++
volk/include/volk/volk_32f_min_aligned16.h | 14 ++++++++++++++
7 files changed, 71 insertions(+), 7 deletions(-)
(limited to 'volk/include')
diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
index 32e13df98..cf94a3f38 100644
--- a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
+++ b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
@@ -140,7 +140,19 @@ static inline void volk_16sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer
}
#endif /* LV_HAVE_GENERIC */
-
+#if LV_HAVE_ORC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+extern void volk_16sc_deinterleave_16s_aligned16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16sc_deinterleave_16s_aligned16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ volk_16sc_deinterleave_16s_aligned16_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
index 86f67437d..50b8b62d5 100644
--- a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
+++ b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
@@ -89,7 +89,20 @@ static inline void volk_16sc_deinterleave_32f_aligned16_generic(float* iBuffer,
}
#endif /* LV_HAVE_GENERIC */
-
+#if LV_HAVE_ORC
+ /*!
+ \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex data values to be deinterleaved
+ */
+extern void volk_16sc_deinterleave_32f_aligned16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16sc_deinterleave_32f_aligned16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ volk_16sc_deinterleave_32f_aligned16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
index c0d1e941a..2dd85a422 100644
--- a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
+++ b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
@@ -77,7 +77,18 @@ static inline void volk_16sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuf
}
#endif /* LV_HAVE_GENERIC */
-
+#if LV_HAVE_ORC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+extern void volk_16sc_deinterleave_real_8s_aligned16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16sc_deinterleave_real_8s_aligned16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ volk_16sc_deinterleave_real_8s_aligned16_orc_impl(iBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
index 9f3222aa6..41e8751d6 100644
--- a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
+++ b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
@@ -173,16 +173,16 @@ static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeV
}
#endif /* LV_HAVE_GENERIC */
-#if LV_HAVE_ORC
+#if LV_HAVE_ORC_DISABLED
/*!
\brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
\param complexVector The vector containing the complex input values
\param magnitudeVector The vector containing the real output values
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/
-extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points);
+extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
static inline void volk_16sc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
- volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, num_points);
+ volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
index e063ae432..c2605d551 100644
--- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
+++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
@@ -161,7 +161,7 @@ static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVec
}
#endif /* LV_HAVE_GENERIC */
-#if LV_HAVE_ORC
+#if LV_HAVE_ORC_DISABLED
/*!
\brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
\param complexVector The vector containing the complex input values
diff --git a/volk/include/volk/volk_32f_max_aligned16.h b/volk/include/volk/volk_32f_max_aligned16.h
index 96aafb2bf..d4e30fba8 100644
--- a/volk/include/volk/volk_32f_max_aligned16.h
+++ b/volk/include/volk/volk_32f_max_aligned16.h
@@ -67,5 +67,19 @@ static inline void volk_32f_max_aligned16_generic(float* cVector, const float* a
}
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_ORC
+/*!
+ \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+extern void volk_32f_max_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_max_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_max_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
#endif /* INCLUDED_VOLK_32f_MAX_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_min_aligned16.h b/volk/include/volk/volk_32f_min_aligned16.h
index e247f4213..55daafb6a 100644
--- a/volk/include/volk/volk_32f_min_aligned16.h
+++ b/volk/include/volk/volk_32f_min_aligned16.h
@@ -67,5 +67,19 @@ static inline void volk_32f_min_aligned16_generic(float* cVector, const float* a
}
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_ORC
+/*!
+ \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+extern void volk_32f_min_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_min_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_min_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
#endif /* INCLUDED_VOLK_32f_MIN_ALIGNED16_H */
--
cgit
From 5b45b875ed58fd66234764a05da42c6eaff22c4d Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Tue, 11 Jan 2011 15:17:55 -0800
Subject: Volk: Added more Orc routines (including complex multiply). Started
redoing the testing framework so it's easier to add new archs to tests.
---
volk/include/volk/volk_32f_normalize_aligned16.h | 15 +++++++++++++++
volk/include/volk/volk_32fc_32f_multiply_aligned16.h | 13 +++++++++++++
volk/include/volk/volk_32fc_multiply_aligned16.h | 17 +++++++++++++++++
3 files changed, 45 insertions(+)
(limited to 'volk/include')
diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h
index 1aabb1d9d..27fb5f7fa 100644
--- a/volk/include/volk/volk_32f_normalize_aligned16.h
+++ b/volk/include/volk/volk_32f_normalize_aligned16.h
@@ -60,6 +60,21 @@ static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const
}
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_ORC
+/*!
+ \brief Normalizes the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be normalizeed
+ \param bVector One of the vectors to be normalizeed
+ \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
+*/
+extern void volk_32f_normalize_aligned16_orc_impl(float* vecBuffer, const float scalar, unsigned int num_points);
+static inline void volk_32f_normalize_aligned16_orc(float* vecBuffer, const float scalar, unsigned int num_points){
+ float invscalar = 1.0 / scalar;
+ volk_32f_normalize_aligned16_orc_impl(vecBuffer, invscalar, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
+
diff --git a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h
index 436656ca0..304ed8e2d 100644
--- a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h
+++ b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h
@@ -76,6 +76,19 @@ static inline void volk_32fc_32f_multiply_aligned16_generic(lv_32fc_t* cVector,
}
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_ORC
+ /*!
+ \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The complex vector to be multiplied
+ \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+ */
+extern void volk_32fc_32f_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32fc_32f_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+ volk_32fc_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
diff --git a/volk/include/volk/volk_32fc_multiply_aligned16.h b/volk/include/volk/volk_32fc_multiply_aligned16.h
index 6a1649fdb..c8f2418c3 100644
--- a/volk/include/volk/volk_32fc_multiply_aligned16.h
+++ b/volk/include/volk/volk_32fc_multiply_aligned16.h
@@ -4,6 +4,7 @@
#include
#include
#include
+#include
#if LV_HAVE_SSE3
#include
@@ -72,6 +73,22 @@ static inline void volk_32fc_multiply_aligned16_generic(lv_32fc_t* cVector, cons
}
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_ORC
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+extern void volk_32fc_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points);
+static inline void volk_32fc_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ static const float mask = -0.0;
+ volk_32fc_multiply_aligned16_orc_impl(cVector, aVector, bVector, mask, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
--
cgit
From c501dc110d3cc7cfcfff178fecb21f30ac9bd54c Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Tue, 11 Jan 2011 15:35:04 -0800
Subject: Volk: fixed normalize.
---
volk/include/volk/volk_32f_normalize_aligned16.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'volk/include')
diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h
index 27fb5f7fa..323d0513c 100644
--- a/volk/include/volk/volk_32f_normalize_aligned16.h
+++ b/volk/include/volk/volk_32f_normalize_aligned16.h
@@ -68,10 +68,10 @@ static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const
\param bVector One of the vectors to be normalizeed
\param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
*/
-extern void volk_32f_normalize_aligned16_orc_impl(float* vecBuffer, const float scalar, unsigned int num_points);
+extern void volk_32f_normalize_aligned16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points);
static inline void volk_32f_normalize_aligned16_orc(float* vecBuffer, const float scalar, unsigned int num_points){
float invscalar = 1.0 / scalar;
- volk_32f_normalize_aligned16_orc_impl(vecBuffer, invscalar, num_points);
+ volk_32f_normalize_aligned16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
}
#endif /* LV_HAVE_GENERIC */
--
cgit
From d486ff4b4c039c8b3b06b6519839d522cf69be69 Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Sun, 16 Jan 2011 14:03:16 -0800
Subject: volk_rename: renamed basically everything in the volk lib to have
logically consistent function names
---
volk/include/volk/Makefile.am | 174 ++++----
volk/include/volk/make_c.py | 3 +-
volk/include/volk/volk_16s_add_quad_a16.h | 136 ++++++
volk/include/volk/volk_16s_add_quad_aligned16.h | 136 ------
volk/include/volk/volk_16s_branch_4_state_8_a16.h | 194 +++++++++
.../volk/volk_16s_branch_4_state_8_aligned16.h | 194 ---------
volk/include/volk/volk_16s_convert_32f_aligned16.h | 119 ------
.../volk/volk_16s_convert_32f_unaligned16.h | 122 ------
volk/include/volk/volk_16s_convert_8s_a16.h | 69 +++
volk/include/volk/volk_16s_convert_8s_aligned16.h | 69 ---
volk/include/volk/volk_16s_convert_8s_ua16.h | 71 ++++
.../include/volk/volk_16s_convert_8s_unaligned16.h | 71 ----
volk/include/volk/volk_16s_max_star_16s_a16.h | 108 +++++
volk/include/volk/volk_16s_max_star_aligned16.h | 108 -----
.../volk/volk_16s_max_star_horizontal_16s_a16.h | 130 ++++++
.../volk/volk_16s_max_star_horizontal_aligned16.h | 130 ------
.../volk/volk_16s_permute_and_scalar_add_a16.h | 139 ++++++
.../volk_16s_permute_and_scalar_add_aligned16.h | 139 ------
volk/include/volk/volk_16s_quad_max_star_16s_a16.h | 191 +++++++++
.../volk/volk_16s_quad_max_star_aligned16.h | 191 ---------
volk/include/volk/volk_16s_s32f_convert_32f_a16.h | 119 ++++++
volk/include/volk/volk_16s_s32f_convert_32f_ua16.h | 122 ++++++
.../volk/volk_16sc_deinterleave_16s_16s_a16.h | 158 +++++++
.../volk/volk_16sc_deinterleave_16s_aligned16.h | 158 -------
.../volk/volk_16sc_deinterleave_32f_aligned16.h | 108 -----
.../volk/volk_16sc_deinterleave_real_16s_a16.h | 120 ++++++
.../volk_16sc_deinterleave_real_16s_aligned16.h | 120 ------
.../volk_16sc_deinterleave_real_32f_aligned16.h | 125 ------
.../volk/volk_16sc_deinterleave_real_8s_a16.h | 94 +++++
.../volk_16sc_deinterleave_real_8s_aligned16.h | 94 -----
volk/include/volk/volk_16sc_magnitude_16s_a16.h | 190 +++++++++
.../volk/volk_16sc_magnitude_16s_aligned16.h | 190 ---------
.../volk/volk_16sc_magnitude_32f_aligned16.h | 179 --------
.../volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h | 108 +++++
.../volk_16sc_s32f_deinterleave_real_32f_a16.h | 125 ++++++
.../volk/volk_16sc_s32f_magnitude_32f_a16.h | 179 ++++++++
volk/include/volk/volk_16u_byteswap_a16.h | 77 ++++
volk/include/volk/volk_16u_byteswap_aligned16.h | 77 ----
.../volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h | 151 +++++++
volk/include/volk/volk_32f_32f_add_32f_a16.h | 81 ++++
volk/include/volk/volk_32f_32f_divide_32f_a16.h | 82 ++++
volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h | 184 ++++++++
volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h | 184 ++++++++
.../volk/volk_32f_32f_interleave_32fc_a16.h | 75 ++++
volk/include/volk/volk_32f_32f_max_32f_a16.h | 85 ++++
volk/include/volk/volk_32f_32f_min_32f_a16.h | 85 ++++
volk/include/volk/volk_32f_32f_multiply_32f_a16.h | 81 ++++
.../volk/volk_32f_32f_s32f_interleave_16sc_a16.h | 155 +++++++
volk/include/volk/volk_32f_32f_subtract_32f_a16.h | 81 ++++
volk/include/volk/volk_32f_accumulator_aligned16.h | 67 ---
volk/include/volk/volk_32f_accumulator_s32f_a16.h | 67 +++
volk/include/volk/volk_32f_add_aligned16.h | 81 ----
.../volk/volk_32f_calc_spectral_noise_floor_a16.h | 167 ++++++++
.../volk_32f_calc_spectral_noise_floor_aligned16.h | 167 --------
volk/include/volk/volk_32f_convert_16s_aligned16.h | 110 -----
.../volk/volk_32f_convert_16s_unaligned16.h | 113 -----
volk/include/volk/volk_32f_convert_32s_aligned16.h | 106 -----
.../volk/volk_32f_convert_32s_unaligned16.h | 109 -----
volk/include/volk/volk_32f_convert_64f_a16.h | 70 +++
volk/include/volk/volk_32f_convert_64f_aligned16.h | 70 ---
volk/include/volk/volk_32f_convert_64f_ua16.h | 70 +++
.../volk/volk_32f_convert_64f_unaligned16.h | 70 ---
volk/include/volk/volk_32f_convert_8s_aligned16.h | 117 ------
.../include/volk/volk_32f_convert_8s_unaligned16.h | 120 ------
volk/include/volk/volk_32f_divide_aligned16.h | 82 ----
volk/include/volk/volk_32f_dot_prod_aligned16.h | 184 --------
volk/include/volk/volk_32f_dot_prod_unaligned16.h | 184 --------
volk/include/volk/volk_32f_fm_detect_aligned16.h | 120 ------
volk/include/volk/volk_32f_index_max_16u_a16.h | 148 +++++++
volk/include/volk/volk_32f_index_max_aligned16.h | 148 -------
.../volk/volk_32f_interleave_16sc_aligned16.h | 155 -------
.../volk/volk_32f_interleave_32fc_aligned16.h | 75 ----
volk/include/volk/volk_32f_max_aligned16.h | 85 ----
volk/include/volk/volk_32f_min_aligned16.h | 85 ----
volk/include/volk/volk_32f_multiply_aligned16.h | 81 ----
volk/include/volk/volk_32f_normalize_aligned16.h | 81 ----
volk/include/volk/volk_32f_power_aligned16.h | 144 -------
.../volk/volk_32f_s32f_32f_fm_detect_32f_a16.h | 120 ++++++
volk/include/volk/volk_32f_s32f_convert_16s_a16.h | 110 +++++
volk/include/volk/volk_32f_s32f_convert_16s_ua16.h | 113 +++++
volk/include/volk/volk_32f_s32f_convert_32s_a16.h | 106 +++++
volk/include/volk/volk_32f_s32f_convert_32s_ua16.h | 109 +++++
volk/include/volk/volk_32f_s32f_convert_8s_a16.h | 117 ++++++
volk/include/volk/volk_32f_s32f_convert_8s_ua16.h | 120 ++++++
volk/include/volk/volk_32f_s32f_normalize_a16.h | 81 ++++
volk/include/volk/volk_32f_s32f_power_32f_a16.h | 144 +++++++
volk/include/volk/volk_32f_s32f_stddev_32f_a16.h | 144 +++++++
volk/include/volk/volk_32f_sqrt_32f_a16.h | 77 ++++
volk/include/volk/volk_32f_sqrt_aligned16.h | 77 ----
volk/include/volk/volk_32f_stddev_aligned16.h | 144 -------
.../volk/volk_32f_stddev_and_mean_32f_32f_a16.h | 169 ++++++++
.../volk/volk_32f_stddev_and_mean_aligned16.h | 169 --------
volk/include/volk/volk_32f_subtract_aligned16.h | 81 ----
volk/include/volk/volk_32f_sum_of_poly_aligned16.h | 151 -------
.../include/volk/volk_32fc_32f_multiply_32fc_a16.h | 95 +++++
.../volk/volk_32fc_32f_multiply_aligned16.h | 95 -----
volk/include/volk/volk_32fc_32f_power_32fc_a16.h | 109 +++++
.../volk/volk_32fc_32f_power_32fc_aligned16.h | 109 -----
.../volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h | 344 +++++++++++++++
.../volk/volk_32fc_32fc_dot_prod_32fc_a16.h | 468 +++++++++++++++++++++
.../volk/volk_32fc_32fc_multiply_32fc_a16.h | 95 +++++
...2fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h | 126 ++++++
.../volk/volk_32fc_32fc_square_dist_32f_a16.h | 112 +++++
volk/include/volk/volk_32fc_atan2_32f_aligned16.h | 158 -------
.../volk/volk_32fc_conjugate_dot_prod_aligned16.h | 344 ---------------
.../volk/volk_32fc_deinterleave_32f_32f_a16.h | 75 ++++
.../volk/volk_32fc_deinterleave_32f_aligned16.h | 75 ----
.../volk/volk_32fc_deinterleave_64f_64f_a16.h | 78 ++++
.../volk/volk_32fc_deinterleave_64f_aligned16.h | 78 ----
.../volk/volk_32fc_deinterleave_real_16s_a16.h | 80 ++++
.../volk_32fc_deinterleave_real_16s_aligned16.h | 80 ----
.../volk/volk_32fc_deinterleave_real_32f_a16.h | 68 +++
.../volk_32fc_deinterleave_real_32f_aligned16.h | 68 ---
.../volk/volk_32fc_deinterleave_real_64f_a16.h | 66 +++
.../volk_32fc_deinterleave_real_64f_aligned16.h | 66 ---
volk/include/volk/volk_32fc_dot_prod_aligned16.h | 468 ---------------------
volk/include/volk/volk_32fc_index_max_16u_a16.h | 215 ++++++++++
volk/include/volk/volk_32fc_index_max_aligned16.h | 215 ----------
.../volk/volk_32fc_magnitude_16s_aligned16.h | 158 -------
volk/include/volk/volk_32fc_magnitude_32f_a16.h | 132 ++++++
.../volk/volk_32fc_magnitude_32f_aligned16.h | 132 ------
volk/include/volk/volk_32fc_multiply_aligned16.h | 95 -----
...olk_32fc_power_spectral_density_32f_aligned16.h | 134 ------
.../volk/volk_32fc_power_spectrum_32f_aligned16.h | 126 ------
volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h | 158 +++++++
.../volk/volk_32fc_s32f_magnitude_16s_a16.h | 158 +++++++
.../volk/volk_32fc_s32f_power_spectrum_32f_a16.h | 126 ++++++
...32fc_s32f_s32f_power_spectral_density_32f_a16.h | 134 ++++++
.../include/volk/volk_32fc_square_dist_aligned16.h | 112 -----
.../volk_32fc_square_dist_scalar_mult_aligned16.h | 126 ------
volk/include/volk/volk_32s_32s_and_32s_a16.h | 81 ++++
volk/include/volk/volk_32s_32s_or_32s_a16.h | 81 ++++
volk/include/volk/volk_32s_and_aligned16.h | 81 ----
volk/include/volk/volk_32s_convert_32f_aligned16.h | 73 ----
.../volk/volk_32s_convert_32f_unaligned16.h | 75 ----
volk/include/volk/volk_32s_or_aligned16.h | 81 ----
volk/include/volk/volk_32s_s32f_convert_32f_a16.h | 73 ++++
volk/include/volk/volk_32s_s32f_convert_32f_ua16.h | 75 ++++
volk/include/volk/volk_32u_byteswap_a16.h | 77 ++++
volk/include/volk/volk_32u_byteswap_aligned16.h | 77 ----
volk/include/volk/volk_32u_popcnt_a16.h | 36 ++
volk/include/volk/volk_32u_popcnt_aligned16.h | 36 --
volk/include/volk/volk_64f_64f_max_64f_a16.h | 71 ++++
volk/include/volk/volk_64f_64f_min_64f_a16.h | 71 ++++
volk/include/volk/volk_64f_convert_32f_a16.h | 67 +++
volk/include/volk/volk_64f_convert_32f_aligned16.h | 67 ---
volk/include/volk/volk_64f_convert_32f_ua16.h | 67 +++
.../volk/volk_64f_convert_32f_unaligned16.h | 67 ---
volk/include/volk/volk_64f_max_aligned16.h | 71 ----
volk/include/volk/volk_64f_min_aligned16.h | 71 ----
volk/include/volk/volk_64u_byteswap_a16.h | 88 ++++
volk/include/volk/volk_64u_byteswap_aligned16.h | 88 ----
volk/include/volk/volk_64u_popcnt_a16.h | 50 +++
volk/include/volk/volk_64u_popcnt_aligned16.h | 50 ---
volk/include/volk/volk_8s_convert_16s_a16.h | 83 ++++
volk/include/volk/volk_8s_convert_16s_aligned16.h | 83 ----
volk/include/volk/volk_8s_convert_16s_ua16.h | 73 ++++
.../include/volk/volk_8s_convert_16s_unaligned16.h | 73 ----
volk/include/volk/volk_8s_convert_32f_aligned16.h | 105 -----
.../include/volk/volk_8s_convert_32f_unaligned16.h | 94 -----
volk/include/volk/volk_8s_s32f_convert_32f_a16.h | 105 +++++
volk/include/volk/volk_8s_s32f_convert_32f_ua16.h | 94 +++++
.../volk_8sc_8sc_multiply_conjugate_16sc_a16.h | 102 +++++
...volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h | 122 ++++++
.../volk/volk_8sc_deinterleave_16s_16s_a16.h | 77 ++++
.../volk/volk_8sc_deinterleave_16s_aligned16.h | 77 ----
.../volk/volk_8sc_deinterleave_32f_aligned16.h | 164 --------
.../volk/volk_8sc_deinterleave_real_16s_a16.h | 66 +++
.../volk_8sc_deinterleave_real_16s_aligned16.h | 66 ---
.../volk_8sc_deinterleave_real_32f_aligned16.h | 133 ------
.../volk/volk_8sc_deinterleave_real_8s_a16.h | 67 +++
.../volk/volk_8sc_deinterleave_real_8s_aligned16.h | 67 ---
.../volk_8sc_multiply_conjugate_16sc_aligned16.h | 102 -----
.../volk_8sc_multiply_conjugate_32fc_aligned16.h | 122 ------
.../volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h | 164 ++++++++
.../volk/volk_8sc_s32f_deinterleave_real_32f_a16.h | 133 ++++++
volk/include/volk/volk_register.py | 5 +-
177 files changed, 10158 insertions(+), 10160 deletions(-)
create mode 100644 volk/include/volk/volk_16s_add_quad_a16.h
delete mode 100644 volk/include/volk/volk_16s_add_quad_aligned16.h
create mode 100644 volk/include/volk/volk_16s_branch_4_state_8_a16.h
delete mode 100644 volk/include/volk/volk_16s_branch_4_state_8_aligned16.h
delete mode 100644 volk/include/volk/volk_16s_convert_32f_aligned16.h
delete mode 100644 volk/include/volk/volk_16s_convert_32f_unaligned16.h
create mode 100644 volk/include/volk/volk_16s_convert_8s_a16.h
delete mode 100644 volk/include/volk/volk_16s_convert_8s_aligned16.h
create mode 100644 volk/include/volk/volk_16s_convert_8s_ua16.h
delete mode 100644 volk/include/volk/volk_16s_convert_8s_unaligned16.h
create mode 100644 volk/include/volk/volk_16s_max_star_16s_a16.h
delete mode 100644 volk/include/volk/volk_16s_max_star_aligned16.h
create mode 100644 volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h
delete mode 100644 volk/include/volk/volk_16s_max_star_horizontal_aligned16.h
create mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_a16.h
delete mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h
create mode 100644 volk/include/volk/volk_16s_quad_max_star_16s_a16.h
delete mode 100644 volk/include/volk/volk_16s_quad_max_star_aligned16.h
create mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_a16.h
create mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_ua16.h
create mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h
delete mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
delete mode 100644 volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h
delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h
delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h
create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h
delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
create mode 100644 volk/include/volk/volk_16sc_magnitude_16s_a16.h
delete mode 100644 volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
delete mode 100644 volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
create mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h
create mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h
create mode 100644 volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h
create mode 100644 volk/include/volk/volk_16u_byteswap_a16.h
delete mode 100644 volk/include/volk/volk_16u_byteswap_aligned16.h
create mode 100644 volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h
create mode 100644 volk/include/volk/volk_32f_32f_add_32f_a16.h
create mode 100644 volk/include/volk/volk_32f_32f_divide_32f_a16.h
create mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h
create mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h
create mode 100644 volk/include/volk/volk_32f_32f_interleave_32fc_a16.h
create mode 100644 volk/include/volk/volk_32f_32f_max_32f_a16.h
create mode 100644 volk/include/volk/volk_32f_32f_min_32f_a16.h
create mode 100644 volk/include/volk/volk_32f_32f_multiply_32f_a16.h
create mode 100644 volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h
create mode 100644 volk/include/volk/volk_32f_32f_subtract_32f_a16.h
delete mode 100644 volk/include/volk/volk_32f_accumulator_aligned16.h
create mode 100644 volk/include/volk/volk_32f_accumulator_s32f_a16.h
delete mode 100644 volk/include/volk/volk_32f_add_aligned16.h
create mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h
delete mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_convert_16s_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_convert_16s_unaligned16.h
delete mode 100644 volk/include/volk/volk_32f_convert_32s_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_convert_32s_unaligned16.h
create mode 100644 volk/include/volk/volk_32f_convert_64f_a16.h
delete mode 100644 volk/include/volk/volk_32f_convert_64f_aligned16.h
create mode 100644 volk/include/volk/volk_32f_convert_64f_ua16.h
delete mode 100644 volk/include/volk/volk_32f_convert_64f_unaligned16.h
delete mode 100644 volk/include/volk/volk_32f_convert_8s_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_convert_8s_unaligned16.h
delete mode 100644 volk/include/volk/volk_32f_divide_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_dot_prod_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_dot_prod_unaligned16.h
delete mode 100644 volk/include/volk/volk_32f_fm_detect_aligned16.h
create mode 100644 volk/include/volk/volk_32f_index_max_16u_a16.h
delete mode 100644 volk/include/volk/volk_32f_index_max_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_interleave_16sc_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_interleave_32fc_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_max_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_min_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_multiply_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_normalize_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_power_aligned16.h
create mode 100644 volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h
create mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_a16.h
create mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_ua16.h
create mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_a16.h
create mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_ua16.h
create mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_a16.h
create mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_ua16.h
create mode 100644 volk/include/volk/volk_32f_s32f_normalize_a16.h
create mode 100644 volk/include/volk/volk_32f_s32f_power_32f_a16.h
create mode 100644 volk/include/volk/volk_32f_s32f_stddev_32f_a16.h
create mode 100644 volk/include/volk/volk_32f_sqrt_32f_a16.h
delete mode 100644 volk/include/volk/volk_32f_sqrt_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_stddev_aligned16.h
create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h
delete mode 100644 volk/include/volk/volk_32f_stddev_and_mean_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_subtract_aligned16.h
delete mode 100644 volk/include/volk/volk_32f_sum_of_poly_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h
delete mode 100644 volk/include/volk/volk_32fc_32f_multiply_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_a16.h
delete mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h
create mode 100644 volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h
create mode 100644 volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h
create mode 100644 volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h
create mode 100644 volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h
delete mode 100644 volk/include/volk/volk_32fc_atan2_32f_aligned16.h
delete mode 100644 volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h
delete mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h
delete mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h
delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h
delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h
delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h
delete mode 100644 volk/include/volk/volk_32fc_dot_prod_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_index_max_16u_a16.h
delete mode 100644 volk/include/volk/volk_32fc_index_max_aligned16.h
delete mode 100644 volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_magnitude_32f_a16.h
delete mode 100644 volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
delete mode 100644 volk/include/volk/volk_32fc_multiply_aligned16.h
delete mode 100644 volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h
delete mode 100644 volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h
create mode 100644 volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h
create mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h
create mode 100644 volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h
create mode 100644 volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h
delete mode 100644 volk/include/volk/volk_32fc_square_dist_aligned16.h
delete mode 100644 volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h
create mode 100644 volk/include/volk/volk_32s_32s_and_32s_a16.h
create mode 100644 volk/include/volk/volk_32s_32s_or_32s_a16.h
delete mode 100644 volk/include/volk/volk_32s_and_aligned16.h
delete mode 100644 volk/include/volk/volk_32s_convert_32f_aligned16.h
delete mode 100644 volk/include/volk/volk_32s_convert_32f_unaligned16.h
delete mode 100644 volk/include/volk/volk_32s_or_aligned16.h
create mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_a16.h
create mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_ua16.h
create mode 100644 volk/include/volk/volk_32u_byteswap_a16.h
delete mode 100644 volk/include/volk/volk_32u_byteswap_aligned16.h
create mode 100644 volk/include/volk/volk_32u_popcnt_a16.h
delete mode 100644 volk/include/volk/volk_32u_popcnt_aligned16.h
create mode 100644 volk/include/volk/volk_64f_64f_max_64f_a16.h
create mode 100644 volk/include/volk/volk_64f_64f_min_64f_a16.h
create mode 100644 volk/include/volk/volk_64f_convert_32f_a16.h
delete mode 100644 volk/include/volk/volk_64f_convert_32f_aligned16.h
create mode 100644 volk/include/volk/volk_64f_convert_32f_ua16.h
delete mode 100644 volk/include/volk/volk_64f_convert_32f_unaligned16.h
delete mode 100644 volk/include/volk/volk_64f_max_aligned16.h
delete mode 100644 volk/include/volk/volk_64f_min_aligned16.h
create mode 100644 volk/include/volk/volk_64u_byteswap_a16.h
delete mode 100644 volk/include/volk/volk_64u_byteswap_aligned16.h
create mode 100644 volk/include/volk/volk_64u_popcnt_a16.h
delete mode 100644 volk/include/volk/volk_64u_popcnt_aligned16.h
create mode 100644 volk/include/volk/volk_8s_convert_16s_a16.h
delete mode 100644 volk/include/volk/volk_8s_convert_16s_aligned16.h
create mode 100644 volk/include/volk/volk_8s_convert_16s_ua16.h
delete mode 100644 volk/include/volk/volk_8s_convert_16s_unaligned16.h
delete mode 100644 volk/include/volk/volk_8s_convert_32f_aligned16.h
delete mode 100644 volk/include/volk/volk_8s_convert_32f_unaligned16.h
create mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_a16.h
create mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_ua16.h
create mode 100644 volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h
create mode 100644 volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h
create mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h
delete mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h
delete mode 100644 volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h
create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h
delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h
delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h
create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h
delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h
delete mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h
delete mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h
create mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h
create mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h
(limited to 'volk/include')
diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index 99276ab87..aef1d7ba8 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -41,93 +41,93 @@ volkinclude_HEADERS = \
volk.h \
volk_cpu.h \
volk_environment_init.h \
- volk_16s_add_quad_aligned16.h \
- volk_16s_branch_4_state_8_aligned16.h \
- volk_16sc_deinterleave_16s_aligned16.h \
- volk_16sc_deinterleave_32f_aligned16.h \
- volk_16sc_deinterleave_real_16s_aligned16.h \
- volk_16sc_deinterleave_real_32f_aligned16.h \
- volk_16sc_deinterleave_real_8s_aligned16.h \
- volk_16sc_magnitude_16s_aligned16.h \
- volk_16sc_magnitude_32f_aligned16.h \
- volk_16s_convert_32f_aligned16.h \
- volk_16s_convert_32f_unaligned16.h \
- volk_16s_convert_8s_aligned16.h \
- volk_16s_convert_8s_unaligned16.h \
- volk_16s_max_star_aligned16.h \
- volk_16s_max_star_horizontal_aligned16.h \
- volk_16s_permute_and_scalar_add_aligned16.h \
- volk_16s_quad_max_star_aligned16.h \
- volk_16u_byteswap_aligned16.h \
- volk_32f_accumulator_aligned16.h \
- volk_32f_add_aligned16.h \
- volk_32fc_32f_multiply_aligned16.h \
- volk_32fc_32f_power_32fc_aligned16.h \
- volk_32f_calc_spectral_noise_floor_aligned16.h \
- volk_32fc_atan2_32f_aligned16.h \
- volk_32fc_conjugate_dot_prod_aligned16.h \
- volk_32fc_deinterleave_32f_aligned16.h \
- volk_32fc_deinterleave_64f_aligned16.h \
- volk_32fc_deinterleave_real_16s_aligned16.h \
- volk_32fc_deinterleave_real_32f_aligned16.h \
- volk_32fc_deinterleave_real_64f_aligned16.h \
- volk_32fc_dot_prod_aligned16.h \
- volk_32fc_index_max_aligned16.h \
- volk_32fc_magnitude_16s_aligned16.h \
- volk_32fc_magnitude_32f_aligned16.h \
- volk_32fc_multiply_aligned16.h \
- volk_32f_convert_16s_aligned16.h \
- volk_32f_convert_16s_unaligned16.h \
- volk_32f_convert_32s_aligned16.h \
- volk_32f_convert_32s_unaligned16.h \
- volk_32f_convert_64f_aligned16.h \
- volk_32f_convert_64f_unaligned16.h \
- volk_32f_convert_8s_aligned16.h \
- volk_32f_convert_8s_unaligned16.h \
- volk_32fc_power_spectral_density_32f_aligned16.h \
- volk_32fc_power_spectrum_32f_aligned16.h \
- volk_32fc_square_dist_aligned16.h \
- volk_32fc_square_dist_scalar_mult_aligned16.h \
- volk_32f_divide_aligned16.h \
- volk_32f_dot_prod_aligned16.h \
- volk_32f_dot_prod_unaligned16.h \
- volk_32f_fm_detect_aligned16.h \
- volk_32f_index_max_aligned16.h \
- volk_32f_interleave_16sc_aligned16.h \
- volk_32f_interleave_32fc_aligned16.h \
- volk_32f_max_aligned16.h \
- volk_32f_min_aligned16.h \
- volk_32f_multiply_aligned16.h \
- volk_32f_normalize_aligned16.h \
- volk_32f_power_aligned16.h \
- volk_32f_sqrt_aligned16.h \
- volk_32f_stddev_aligned16.h \
- volk_32f_stddev_and_mean_aligned16.h \
- volk_32f_subtract_aligned16.h \
- volk_32f_sum_of_poly_aligned16.h \
- volk_32s_and_aligned16.h \
- volk_32s_convert_32f_aligned16.h \
- volk_32s_convert_32f_unaligned16.h \
- volk_32s_or_aligned16.h \
- volk_32u_byteswap_aligned16.h \
- volk_32u_popcnt_aligned16.h \
- volk_64f_convert_32f_aligned16.h \
- volk_64f_convert_32f_unaligned16.h \
- volk_64f_max_aligned16.h \
- volk_64f_min_aligned16.h \
- volk_64u_byteswap_aligned16.h \
- volk_64u_popcnt_aligned16.h \
- volk_8sc_deinterleave_16s_aligned16.h \
- volk_8sc_deinterleave_32f_aligned16.h \
- volk_8sc_deinterleave_real_16s_aligned16.h \
- volk_8sc_deinterleave_real_32f_aligned16.h \
- volk_8sc_deinterleave_real_8s_aligned16.h \
- volk_8sc_multiply_conjugate_16sc_aligned16.h \
- volk_8sc_multiply_conjugate_32fc_aligned16.h \
- volk_8s_convert_16s_aligned16.h \
- volk_8s_convert_16s_unaligned16.h \
- volk_8s_convert_32f_aligned16.h \
- volk_8s_convert_32f_unaligned16.h
+ volk_16s_add_quad_a16.h \
+ volk_16s_branch_4_state_8_a16.h \
+ volk_16sc_deinterleave_16s_16s_a16.h \
+ volk_16sc_s32f_deinterleave_32f_32f_a16.h \
+ volk_16sc_deinterleave_real_16s_a16.h \
+ volk_16sc_s32f_deinterleave_real_32f_a16.h \
+ volk_16sc_deinterleave_real_8s_a16.h \
+ volk_16sc_magnitude_16s_a16.h \
+ volk_16sc_s32f_magnitude_32f_a16.h \
+ volk_16s_s32f_convert_32f_a16.h \
+ volk_16s_s32f_convert_32f_ua16.h \
+ volk_16s_convert_8s_a16.h \
+ volk_16s_convert_8s_ua16.h \
+ volk_16s_max_star_16s_a16.h \
+ volk_16s_max_star_horizontal_16s_a16.h \
+ volk_16s_permute_and_scalar_add_a16.h \
+ volk_16s_quad_max_star_16s_a16.h \
+ volk_16u_byteswap_a16.h \
+ volk_32f_accumulator_s32f_a16.h \
+ volk_32f_32f_add_32f_a16.h \
+ volk_32fc_32f_multiply_32fc_a16.h \
+ volk_32fc_32f_power_32fc_a16.h \
+ volk_32f_calc_spectral_noise_floor_a16.h \
+ volk_32fc_s32f_atan2_32f_a16.h \
+ volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h \
+ volk_32fc_deinterleave_32f_32f_a16.h \
+ volk_32fc_deinterleave_64f_64f_a16.h \
+ volk_32fc_deinterleave_real_16s_a16.h \
+ volk_32fc_deinterleave_real_32f_a16.h \
+ volk_32fc_deinterleave_real_64f_a16.h \
+ volk_32fc_32fc_dot_prod_32fc_a16.h \
+ volk_32fc_index_max_16u_a16.h \
+ volk_32fc_s32f_magnitude_16s_a16.h \
+ volk_32fc_magnitude_32f_a16.h \
+ volk_32fc_32fc_multiply_32fc_a16.h \
+ volk_32f_s32f_convert_16s_a16.h \
+ volk_32f_s32f_convert_16s_ua16.h \
+ volk_32f_s32f_convert_32s_a16.h \
+ volk_32f_s32f_convert_32s_ua16.h \
+ volk_32f_convert_64f_a16.h \
+ volk_32f_convert_64f_ua16.h \
+ volk_32f_s32f_convert_8s_a16.h \
+ volk_32f_s32f_convert_8s_ua16.h \
+ volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h \
+ volk_32fc_s32f_power_spectrum_32f_a16.h \
+ volk_32fc_32fc_square_dist_32f_a16.h \
+ volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h \
+ volk_32f_32f_divide_32f_a16.h \
+ volk_32f_32f_dot_prod_32f_a16.h \
+ volk_32f_32f_dot_prod_32f_ua16.h \
+ volk_32f_s32f_32f_fm_detect_32f_a16.h \
+ volk_32f_index_max_16u_a16.h \
+ volk_32f_32f_s32f_interleave_16sc_a16.h \
+ volk_32f_32f_interleave_32fc_a16.h \
+ volk_32f_32f_max_32f_a16.h \
+ volk_32f_32f_min_32f_a16.h \
+ volk_32f_32f_multiply_32f_a16.h \
+ volk_32f_s32f_normalize_a16.h \
+ volk_32f_s32f_power_32f_a16.h \
+ volk_32f_sqrt_32f_a16.h \
+ volk_32f_s32f_stddev_32f_a16.h \
+ volk_32f_stddev_and_mean_32f_32f_a16.h \
+ volk_32f_32f_subtract_32f_a16.h \
+ volk_32f_32f_32f_sum_of_poly_32f_a16.h \
+ volk_32s_32s_and_32s_a16.h \
+ volk_32s_s32f_convert_32f_a16.h \
+ volk_32s_s32f_convert_32f_ua16.h \
+ volk_32s_32s_or_32s_a16.h \
+ volk_32u_byteswap_a16.h \
+ volk_32u_popcnt_a16.h \
+ volk_64f_convert_32f_a16.h \
+ volk_64f_convert_32f_ua16.h \
+ volk_64f_64f_max_64f_a16.h \
+ volk_64f_64f_min_64f_a16.h \
+ volk_64u_byteswap_a16.h \
+ volk_64u_popcnt_a16.h \
+ volk_8sc_deinterleave_16s_16s_a16.h \
+ volk_8sc_s32f_deinterleave_32f_32f_a16.h \
+ volk_8sc_deinterleave_real_16s_a16.h \
+ volk_8sc_s32f_deinterleave_real_32f_a16.h \
+ volk_8sc_deinterleave_real_8s_a16.h \
+ volk_8sc_8sc_multiply_conjugate_16sc_a16.h \
+ volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h \
+ volk_8s_convert_16s_a16.h \
+ volk_8s_convert_16s_ua16.h \
+ volk_8s_s32f_convert_32f_a16.h \
+ volk_8s_s32f_convert_32f_ua16.h
VOLK_MKTABLES_SOURCES = \
$(top_srcdir)/lib/volk_rank_archs.c \
diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py
index f2432d7a4..f708ba7d0 100644
--- a/volk/include/volk/make_c.py
+++ b/volk/include/volk/make_c.py
@@ -24,8 +24,7 @@ def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) :
tempstring = tempstring + " }\n"
tempstring = tempstring + " return 0;\n"
tempstring = tempstring + "}\n"
-
-
+
for i in range(len(funclist)):
tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n";
diff --git a/volk/include/volk/volk_16s_add_quad_a16.h b/volk/include/volk/volk_16s_add_quad_a16.h
new file mode 100644
index 000000000..67d0c55a3
--- /dev/null
+++ b/volk/include/volk/volk_16s_add_quad_a16.h
@@ -0,0 +1,136 @@
+#ifndef INCLUDED_volk_16s_add_quad_a16_H
+#define INCLUDED_volk_16s_add_quad_a16_H
+
+
+#include
+#include
+
+
+
+
+
+#if LV_HAVE_SSE2
+#include
+#include
+
+static inline void volk_16s_add_quad_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
+ p_target0 = (__m128i*)target0;
+ p_target1 = (__m128i*)target1;
+ p_target2 = (__m128i*)target2;
+ p_target3 = (__m128i*)target3;
+
+ p_src0 = (__m128i*)src0;
+ p_src1 = (__m128i*)src1;
+ p_src2 = (__m128i*)src2;
+ p_src3 = (__m128i*)src3;
+ p_src4 = (__m128i*)src4;
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int leftovers = (num_bytes >> 1) & 7;
+
+ for(; i < bound; ++i) {
+ xmm0 = _mm_load_si128(p_src0);
+ xmm1 = _mm_load_si128(p_src1);
+ xmm2 = _mm_load_si128(p_src2);
+ xmm3 = _mm_load_si128(p_src3);
+ xmm4 = _mm_load_si128(p_src4);
+
+ p_src0 += 1;
+ p_src1 += 1;
+
+ xmm1 = _mm_add_epi16(xmm0, xmm1);
+ xmm2 = _mm_add_epi16(xmm0, xmm2);
+ xmm3 = _mm_add_epi16(xmm0, xmm3);
+ xmm4 = _mm_add_epi16(xmm0, xmm4);
+
+
+ p_src2 += 1;
+ p_src3 += 1;
+ p_src4 += 1;
+
+ _mm_store_si128(p_target0, xmm1);
+ _mm_store_si128(p_target1, xmm2);
+ _mm_store_si128(p_target2, xmm3);
+ _mm_store_si128(p_target3, xmm4);
+
+ p_target0 += 1;
+ p_target1 += 1;
+ p_target2 += 1;
+ p_target3 += 1;
+ }
+ /*asm volatile
+ (
+ ".%=volk_16s_add_quad_a16_sse2_L1:\n\t"
+ "cmp $0, %[bound]\n\t"
+ "je .%=volk_16s_add_quad_a16_sse2_END\n\t"
+ "movaps (%[src0]), %%xmm1\n\t"
+ "movaps (%[src1]), %%xmm2\n\t"
+ "movaps (%[src2]), %%xmm3\n\t"
+ "movaps (%[src3]), %%xmm4\n\t"
+ "movaps (%[src4]), %%xmm5\n\t"
+ "add $16, %[src0]\n\t"
+ "add $16, %[src1]\n\t"
+ "add $16, %[src2]\n\t"
+ "add $16, %[src3]\n\t"
+ "add $16, %[src4]\n\t"
+ "paddw %%xmm1, %%xmm2\n\t"
+ "paddw %%xmm1, %%xmm3\n\t"
+ "paddw %%xmm1, %%xmm4\n\t"
+ "paddw %%xmm1, %%xmm5\n\t"
+ "add $-1, %[bound]\n\t"
+ "movaps %%xmm2, (%[target0])\n\t"
+ "movaps %%xmm3, (%[target1])\n\t"
+ "movaps %%xmm4, (%[target2])\n\t"
+ "movaps %%xmm5, (%[target3])\n\t"
+ "add $16, %[target0]\n\t"
+ "add $16, %[target1]\n\t"
+ "add $16, %[target2]\n\t"
+ "add $16, %[target3]\n\t"
+ "jmp .%=volk_16s_add_quad_a16_sse2_L1\n\t"
+ ".%=volk_16s_add_quad_a16_sse2_END:\n\t"
+ :
+ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
+ :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+
+ */
+
+
+ for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ target0[i] = src0[i] + src1[i];
+ target1[i] = src0[i] + src2[i];
+ target2[i] = src0[i] + src3[i];
+ target3[i] = src0[i] + src4[i];
+ }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_16s_add_quad_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ for(i = 0; i < bound; ++i) {
+ target0[i] = src0[i] + src1[i];
+ target1[i] = src0[i] + src2[i];
+ target2[i] = src0[i] + src3[i];
+ target3[i] = src0[i] + src4[i];
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+
+#endif /*INCLUDED_volk_16s_add_quad_a16_H*/
diff --git a/volk/include/volk/volk_16s_add_quad_aligned16.h b/volk/include/volk/volk_16s_add_quad_aligned16.h
deleted file mode 100644
index 63042bef1..000000000
--- a/volk/include/volk/volk_16s_add_quad_aligned16.h
+++ /dev/null
@@ -1,136 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H
-#define INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H
-
-
-#include
-#include
-
-
-
-
-
-#if LV_HAVE_SSE2
-#include
-#include
-
-static inline void volk_16s_add_quad_aligned16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
-
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
- __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
- p_target0 = (__m128i*)target0;
- p_target1 = (__m128i*)target1;
- p_target2 = (__m128i*)target2;
- p_target3 = (__m128i*)target3;
-
- p_src0 = (__m128i*)src0;
- p_src1 = (__m128i*)src1;
- p_src2 = (__m128i*)src2;
- p_src3 = (__m128i*)src3;
- p_src4 = (__m128i*)src4;
-
- int i = 0;
-
- int bound = (num_bytes >> 4);
- int leftovers = (num_bytes >> 1) & 7;
-
- for(; i < bound; ++i) {
- xmm0 = _mm_load_si128(p_src0);
- xmm1 = _mm_load_si128(p_src1);
- xmm2 = _mm_load_si128(p_src2);
- xmm3 = _mm_load_si128(p_src3);
- xmm4 = _mm_load_si128(p_src4);
-
- p_src0 += 1;
- p_src1 += 1;
-
- xmm1 = _mm_add_epi16(xmm0, xmm1);
- xmm2 = _mm_add_epi16(xmm0, xmm2);
- xmm3 = _mm_add_epi16(xmm0, xmm3);
- xmm4 = _mm_add_epi16(xmm0, xmm4);
-
-
- p_src2 += 1;
- p_src3 += 1;
- p_src4 += 1;
-
- _mm_store_si128(p_target0, xmm1);
- _mm_store_si128(p_target1, xmm2);
- _mm_store_si128(p_target2, xmm3);
- _mm_store_si128(p_target3, xmm4);
-
- p_target0 += 1;
- p_target1 += 1;
- p_target2 += 1;
- p_target3 += 1;
- }
- /*asm volatile
- (
- ".%=volk_16s_add_quad_aligned16_sse2_L1:\n\t"
- "cmp $0, %[bound]\n\t"
- "je .%=volk_16s_add_quad_aligned16_sse2_END\n\t"
- "movaps (%[src0]), %%xmm1\n\t"
- "movaps (%[src1]), %%xmm2\n\t"
- "movaps (%[src2]), %%xmm3\n\t"
- "movaps (%[src3]), %%xmm4\n\t"
- "movaps (%[src4]), %%xmm5\n\t"
- "add $16, %[src0]\n\t"
- "add $16, %[src1]\n\t"
- "add $16, %[src2]\n\t"
- "add $16, %[src3]\n\t"
- "add $16, %[src4]\n\t"
- "paddw %%xmm1, %%xmm2\n\t"
- "paddw %%xmm1, %%xmm3\n\t"
- "paddw %%xmm1, %%xmm4\n\t"
- "paddw %%xmm1, %%xmm5\n\t"
- "add $-1, %[bound]\n\t"
- "movaps %%xmm2, (%[target0])\n\t"
- "movaps %%xmm3, (%[target1])\n\t"
- "movaps %%xmm4, (%[target2])\n\t"
- "movaps %%xmm5, (%[target3])\n\t"
- "add $16, %[target0]\n\t"
- "add $16, %[target1]\n\t"
- "add $16, %[target2]\n\t"
- "add $16, %[target3]\n\t"
- "jmp .%=volk_16s_add_quad_aligned16_sse2_L1\n\t"
- ".%=volk_16s_add_quad_aligned16_sse2_END:\n\t"
- :
- :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
- :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-
- */
-
-
- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
- target0[i] = src0[i] + src1[i];
- target1[i] = src0[i] + src2[i];
- target2[i] = src0[i] + src3[i];
- target3[i] = src0[i] + src4[i];
- }
-}
-#endif /*LV_HAVE_SSE2*/
-
-
-#if LV_HAVE_GENERIC
-
-static inline void volk_16s_add_quad_aligned16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
-
- int i = 0;
-
- int bound = num_bytes >> 1;
-
- for(i = 0; i < bound; ++i) {
- target0[i] = src0[i] + src1[i];
- target1[i] = src0[i] + src2[i];
- target2[i] = src0[i] + src3[i];
- target3[i] = src0[i] + src4[i];
- }
-}
-
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-
-#endif /*INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_branch_4_state_8_a16.h b/volk/include/volk/volk_16s_branch_4_state_8_a16.h
new file mode 100644
index 000000000..4c1af8729
--- /dev/null
+++ b/volk/include/volk/volk_16s_branch_4_state_8_a16.h
@@ -0,0 +1,194 @@
+#ifndef INCLUDED_volk_16s_branch_4_state_8_a16_H
+#define INCLUDED_volk_16s_branch_4_state_8_a16_H
+
+
+#include
+#include
+
+
+
+
+#if LV_HAVE_SSSE3
+
+#include
+#include
+#include
+
+static inline void volk_16s_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+
+ __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+
+
+
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+ p_cntl2 = (__m128i*)cntl2;
+ p_cntl3 = (__m128i*)cntl3;
+ p_scalars = (__m128i*)scalars;
+
+ int i = 0;
+
+ int bound = 1;
+
+
+ xmm0 = _mm_load_si128(p_scalars);
+
+ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+
+ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+ xmm0 = _mm_load_si128((__m128i*)permuters[0]);
+ xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+ xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+ xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+
+ for(; i < bound; ++i) {
+
+ xmm5 = _mm_load_si128(p_src0);
+
+
+
+
+
+
+
+
+
+ xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
+ xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+ xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+ xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+
+ p_src0 += 4;
+
+
+ xmm5 = _mm_add_epi16(xmm1, xmm2);
+
+ xmm6 = _mm_add_epi16(xmm2, xmm6);
+ xmm8 = _mm_add_epi16(xmm1, xmm8);
+
+
+ xmm7 = _mm_load_si128(p_cntl2);
+ xmm9 = _mm_load_si128(p_cntl3);
+
+ xmm0 = _mm_add_epi16(xmm5, xmm0);
+
+
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+ xmm5 = _mm_load_si128(&p_cntl2[1]);
+ xmm11 = _mm_load_si128(&p_cntl3[1]);
+
+ xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm7);
+
+
+
+ xmm7 = _mm_load_si128(&p_cntl2[2]);
+ xmm9 = _mm_load_si128(&p_cntl3[2]);
+
+ xmm5 = _mm_add_epi16(xmm5, xmm11);
+
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+ xmm6 = _mm_add_epi16(xmm6, xmm5);
+
+
+ xmm5 = _mm_load_si128(&p_cntl2[3]);
+ xmm11 = _mm_load_si128(&p_cntl3[3]);
+
+ xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+ xmm8 = _mm_add_epi16(xmm8, xmm7);
+
+ xmm5 = _mm_add_epi16(xmm5, xmm11);
+
+ _mm_store_si128(p_target, xmm0);
+ _mm_store_si128(&p_target[1], xmm6);
+
+ xmm10 = _mm_add_epi16(xmm5, xmm10);
+
+ _mm_store_si128(&p_target[2], xmm8);
+
+ _mm_store_si128(&p_target[3], xmm10);
+
+ p_target += 3;
+ }
+}
+
+
+#endif /*LV_HAVE_SSEs*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+ int i = 0;
+
+ int bound = 4;
+
+ for(; i < bound; ++i) {
+ target[i* 8] = src0[((char)permuters[i][0])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8] & scalars[2])
+ + (cntl3[i * 8] & scalars[3]);
+ target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 1] & scalars[2])
+ + (cntl3[i * 8 + 1] & scalars[3]);
+ target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 2] & scalars[2])
+ + (cntl3[i * 8 + 2] & scalars[3]);
+ target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 3] & scalars[2])
+ + (cntl3[i * 8 + 3] & scalars[3]);
+ target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 4] & scalars[2])
+ + (cntl3[i * 8 + 4] & scalars[3]);
+ target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 5] & scalars[2])
+ + (cntl3[i * 8 + 5] & scalars[3]);
+ target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 6] & scalars[2])
+ + (cntl3[i * 8 + 6] & scalars[3]);
+ target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 7] & scalars[2])
+ + (cntl3[i * 8 + 7] & scalars[3]);
+
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16s_branch_4_state_8_a16_H*/
diff --git a/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h b/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h
deleted file mode 100644
index fb9d7cb87..000000000
--- a/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h
+++ /dev/null
@@ -1,194 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H
-#define INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H
-
-
-#include
-#include
-
-
-
-
-#if LV_HAVE_SSSE3
-
-#include
-#include
-#include
-
-static inline void volk_16s_branch_4_state_8_aligned16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
-
-
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
-
- __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
-
-
-
- p_target = (__m128i*)target;
- p_src0 = (__m128i*)src0;
- p_cntl2 = (__m128i*)cntl2;
- p_cntl3 = (__m128i*)cntl3;
- p_scalars = (__m128i*)scalars;
-
- int i = 0;
-
- int bound = 1;
-
-
- xmm0 = _mm_load_si128(p_scalars);
-
- xmm1 = _mm_shufflelo_epi16(xmm0, 0);
- xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
- xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
- xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
-
- xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
- xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
- xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
- xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
-
- xmm0 = _mm_load_si128((__m128i*)permuters[0]);
- xmm6 = _mm_load_si128((__m128i*)permuters[1]);
- xmm8 = _mm_load_si128((__m128i*)permuters[2]);
- xmm10 = _mm_load_si128((__m128i*)permuters[3]);
-
- for(; i < bound; ++i) {
-
- xmm5 = _mm_load_si128(p_src0);
-
-
-
-
-
-
-
-
-
- xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
- xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
- xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
- xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
-
- p_src0 += 4;
-
-
- xmm5 = _mm_add_epi16(xmm1, xmm2);
-
- xmm6 = _mm_add_epi16(xmm2, xmm6);
- xmm8 = _mm_add_epi16(xmm1, xmm8);
-
-
- xmm7 = _mm_load_si128(p_cntl2);
- xmm9 = _mm_load_si128(p_cntl3);
-
- xmm0 = _mm_add_epi16(xmm5, xmm0);
-
-
- xmm7 = _mm_and_si128(xmm7, xmm3);
- xmm9 = _mm_and_si128(xmm9, xmm4);
-
- xmm5 = _mm_load_si128(&p_cntl2[1]);
- xmm11 = _mm_load_si128(&p_cntl3[1]);
-
- xmm7 = _mm_add_epi16(xmm7, xmm9);
-
- xmm5 = _mm_and_si128(xmm5, xmm3);
- xmm11 = _mm_and_si128(xmm11, xmm4);
-
- xmm0 = _mm_add_epi16(xmm0, xmm7);
-
-
-
- xmm7 = _mm_load_si128(&p_cntl2[2]);
- xmm9 = _mm_load_si128(&p_cntl3[2]);
-
- xmm5 = _mm_add_epi16(xmm5, xmm11);
-
- xmm7 = _mm_and_si128(xmm7, xmm3);
- xmm9 = _mm_and_si128(xmm9, xmm4);
-
- xmm6 = _mm_add_epi16(xmm6, xmm5);
-
-
- xmm5 = _mm_load_si128(&p_cntl2[3]);
- xmm11 = _mm_load_si128(&p_cntl3[3]);
-
- xmm7 = _mm_add_epi16(xmm7, xmm9);
-
- xmm5 = _mm_and_si128(xmm5, xmm3);
- xmm11 = _mm_and_si128(xmm11, xmm4);
-
- xmm8 = _mm_add_epi16(xmm8, xmm7);
-
- xmm5 = _mm_add_epi16(xmm5, xmm11);
-
- _mm_store_si128(p_target, xmm0);
- _mm_store_si128(&p_target[1], xmm6);
-
- xmm10 = _mm_add_epi16(xmm5, xmm10);
-
- _mm_store_si128(&p_target[2], xmm8);
-
- _mm_store_si128(&p_target[3], xmm10);
-
- p_target += 3;
- }
-}
-
-
-#endif /*LV_HAVE_SSEs*/
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_branch_4_state_8_aligned16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
- int i = 0;
-
- int bound = 4;
-
- for(; i < bound; ++i) {
- target[i* 8] = src0[((char)permuters[i][0])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8] & scalars[2])
- + (cntl3[i * 8] & scalars[3]);
- target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 1] & scalars[2])
- + (cntl3[i * 8 + 1] & scalars[3]);
- target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 2] & scalars[2])
- + (cntl3[i * 8 + 2] & scalars[3]);
- target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 3] & scalars[2])
- + (cntl3[i * 8 + 3] & scalars[3]);
- target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 4] & scalars[2])
- + (cntl3[i * 8 + 4] & scalars[3]);
- target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 5] & scalars[2])
- + (cntl3[i * 8 + 5] & scalars[3]);
- target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 6] & scalars[2])
- + (cntl3[i * 8 + 6] & scalars[3]);
- target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 7] & scalars[2])
- + (cntl3[i * 8 + 7] & scalars[3]);
-
- }
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_convert_32f_aligned16.h b/volk/include/volk/volk_16s_convert_32f_aligned16.h
deleted file mode 100644
index 126ce1528..000000000
--- a/volk/include/volk/volk_16s_convert_32f_aligned16.h
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H
-#define INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE4_1
-#include
-
- /*!
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 16 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_16s_convert_32f_aligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- float* outputVectorPtr = outputVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal;
- __m128i inputVal2;
- __m128 ret;
-
- for(;number < eighthPoints; number++){
-
- // Load the 8 values
- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
- // Shift the input data to the right by 64 bits ( 8 bytes )
- inputVal2 = _mm_srli_si128(inputVal, 8);
-
- // Convert the lower 4 values into 32 bit words
- inputVal = _mm_cvtepi16_epi32(inputVal);
- inputVal2 = _mm_cvtepi16_epi32(inputVal2);
-
- ret = _mm_cvtepi32_ps(inputVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- ret = _mm_cvtepi32_ps(inputVal2);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
-
- outputVectorPtr += 4;
-
- inputPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) / scalar;
- }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include
-
- /*!
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 16 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_16s_convert_32f_aligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* outputVectorPtr = outputVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128 ret;
-
- for(;number < quarterPoints; number++){
- ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
-
- inputPtr += 4;
- outputVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]) / scalar;
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
- /*!
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 16 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_16s_convert_32f_aligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
- float* outputVectorPtr = outputVector;
- const int16_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_convert_32f_unaligned16.h b/volk/include/volk/volk_16s_convert_32f_unaligned16.h
deleted file mode 100644
index d6212fba5..000000000
--- a/volk/include/volk/volk_16s_convert_32f_unaligned16.h
+++ /dev/null
@@ -1,122 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H
-#define INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE4_1
-#include
-
- /*!
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 16 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- \note Output buffer does NOT need to be properly aligned
- */
-static inline void volk_16s_convert_32f_unaligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- float* outputVectorPtr = outputVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal;
- __m128i inputVal2;
- __m128 ret;
-
- for(;number < eighthPoints; number++){
-
- // Load the 8 values
- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
- // Shift the input data to the right by 64 bits ( 8 bytes )
- inputVal2 = _mm_srli_si128(inputVal, 8);
-
- // Convert the lower 4 values into 32 bit words
- inputVal = _mm_cvtepi16_epi32(inputVal);
- inputVal2 = _mm_cvtepi16_epi32(inputVal2);
-
- ret = _mm_cvtepi32_ps(inputVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- ret = _mm_cvtepi32_ps(inputVal2);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
-
- outputVectorPtr += 4;
-
- inputPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) / scalar;
- }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include
-
- /*!
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 16 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- \note Output buffer does NOT need to be properly aligned
- */
-static inline void volk_16s_convert_32f_unaligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* outputVectorPtr = outputVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128 ret;
-
- for(;number < quarterPoints; number++){
- ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
-
- inputPtr += 4;
- outputVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]) / scalar;
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
- /*!
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 16 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- \note Output buffer does NOT need to be properly aligned
- */
-static inline void volk_16s_convert_32f_unaligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
- float* outputVectorPtr = outputVector;
- const int16_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_a16.h b/volk/include/volk/volk_16s_convert_8s_a16.h
new file mode 100644
index 000000000..13db435de
--- /dev/null
+++ b/volk/include/volk/volk_16s_convert_8s_a16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_volk_16s_convert_8s_a16_H
+#define INCLUDED_volk_16s_convert_8s_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE2
+#include
+/*!
+ \brief Converts the input 16 bit integer data into 8 bit integer data
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param num_points The number of data values to be converted
+*/
+static inline void volk_16s_convert_8s_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal1;
+ __m128i inputVal2;
+ __m128i ret;
+
+ for(;number < sixteenthPoints; number++){
+
+ // Load the 16 values
+ inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+ inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+
+ inputVal1 = _mm_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm_srai_epi16(inputVal2, 8);
+
+ ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, ret);
+
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ outputVector[number] =(int8_t)(inputVector[number] >> 8);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the input 16 bit integer data into 8 bit integer data
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param num_points The number of data values to be converted
+*/
+static inline void volk_16s_convert_8s_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16s_convert_8s_a16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_aligned16.h b/volk/include/volk/volk_16s_convert_8s_aligned16.h
deleted file mode 100644
index 64c368688..000000000
--- a/volk/include/volk/volk_16s_convert_8s_aligned16.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H
-#define INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE2
-#include
-/*!
- \brief Converts the input 16 bit integer data into 8 bit integer data
- \param inputVector The 16 bit input data buffer
- \param outputVector The 8 bit output data buffer
- \param num_points The number of data values to be converted
-*/
-static inline void volk_16s_convert_8s_aligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- int8_t* outputVectorPtr = outputVector;
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal1;
- __m128i inputVal2;
- __m128i ret;
-
- for(;number < sixteenthPoints; number++){
-
- // Load the 16 values
- inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
- inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
-
- inputVal1 = _mm_srai_epi16(inputVal1, 8);
- inputVal2 = _mm_srai_epi16(inputVal2, 8);
-
- ret = _mm_packs_epi16(inputVal1, inputVal2);
-
- _mm_store_si128((__m128i*)outputVectorPtr, ret);
-
- outputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] =(int8_t)(inputVector[number] >> 8);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Converts the input 16 bit integer data into 8 bit integer data
- \param inputVector The 16 bit input data buffer
- \param outputVector The 8 bit output data buffer
- \param num_points The number of data values to be converted
-*/
-static inline void volk_16s_convert_8s_aligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
- int8_t* outputVectorPtr = outputVector;
- const int16_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_ua16.h b/volk/include/volk/volk_16s_convert_8s_ua16.h
new file mode 100644
index 000000000..9941118ae
--- /dev/null
+++ b/volk/include/volk/volk_16s_convert_8s_ua16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_16s_convert_8s_ua16_H
+#define INCLUDED_volk_16s_convert_8s_ua16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE2
+#include
+/*!
+ \brief Converts the input 16 bit integer data into 8 bit integer data
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param num_points The number of data values to be converted
+ \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16s_convert_8s_ua16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal1;
+ __m128i inputVal2;
+ __m128i ret;
+
+ for(;number < sixteenthPoints; number++){
+
+ // Load the 16 values
+ inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+ inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+
+ inputVal1 = _mm_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm_srai_epi16(inputVal2, 8);
+
+ ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
+
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ outputVector[number] =(int8_t)(inputVector[number] >> 8);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the input 16 bit integer data into 8 bit integer data
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param num_points The number of data values to be converted
+ \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16s_convert_8s_ua16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16s_convert_8s_ua16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_unaligned16.h b/volk/include/volk/volk_16s_convert_8s_unaligned16.h
deleted file mode 100644
index ca925de86..000000000
--- a/volk/include/volk/volk_16s_convert_8s_unaligned16.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H
-#define INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE2
-#include
-/*!
- \brief Converts the input 16 bit integer data into 8 bit integer data
- \param inputVector The 16 bit input data buffer
- \param outputVector The 8 bit output data buffer
- \param num_points The number of data values to be converted
- \note Input and output buffers do NOT need to be properly aligned
-*/
-static inline void volk_16s_convert_8s_unaligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- int8_t* outputVectorPtr = outputVector;
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal1;
- __m128i inputVal2;
- __m128i ret;
-
- for(;number < sixteenthPoints; number++){
-
- // Load the 16 values
- inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
- inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
-
- inputVal1 = _mm_srai_epi16(inputVal1, 8);
- inputVal2 = _mm_srai_epi16(inputVal2, 8);
-
- ret = _mm_packs_epi16(inputVal1, inputVal2);
-
- _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
-
- outputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] =(int8_t)(inputVector[number] >> 8);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Converts the input 16 bit integer data into 8 bit integer data
- \param inputVector The 16 bit input data buffer
- \param outputVector The 8 bit output data buffer
- \param num_points The number of data values to be converted
- \note Input and output buffers do NOT need to be properly aligned
-*/
-static inline void volk_16s_convert_8s_unaligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
- int8_t* outputVectorPtr = outputVector;
- const int16_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_max_star_16s_a16.h b/volk/include/volk/volk_16s_max_star_16s_a16.h
new file mode 100644
index 000000000..b2ec90552
--- /dev/null
+++ b/volk/include/volk/volk_16s_max_star_16s_a16.h
@@ -0,0 +1,108 @@
+#ifndef INCLUDED_volk_16s_max_star_16s_a16_H
+#define INCLUDED_volk_16s_max_star_16s_a16_H
+
+
+#include
+#include
+
+
+#if LV_HAVE_SSSE3
+
+#include
+#include
+#include
+
+static inline void volk_16s_max_star_16s_a16_ssse3(short* target, short* src0, unsigned int num_bytes) {
+
+
+
+ short candidate = src0[0];
+ short cands[8];
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+
+
+ __m128i *p_src0;
+
+ p_src0 = (__m128i*)src0;
+
+ int bound = num_bytes >> 4;
+ int leftovers = (num_bytes >> 1) & 7;
+
+ int i = 0;
+
+
+ xmm1 = _mm_setzero_si128();
+ xmm0 = _mm_setzero_si128();
+ //_mm_insert_epi16(xmm0, candidate, 0);
+
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
+
+
+ for(i = 0; i < bound; ++i) {
+ xmm1 = _mm_load_si128(p_src0);
+ p_src0 += 1;
+ xmm2 = _mm_sub_epi16(xmm1, xmm0);
+
+
+
+
+
+
+ xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
+ xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
+ xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
+
+ xmm6 = _mm_xor_si128(xmm4, xmm5);
+
+ xmm3 = _mm_and_si128(xmm3, xmm0);
+ xmm4 = _mm_and_si128(xmm6, xmm1);
+
+ xmm0 = _mm_add_epi16(xmm3, xmm4);
+
+
+ }
+
+ _mm_store_si128((__m128i*)cands, xmm0);
+
+ for(i = 0; i < 8; ++i) {
+ candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+ }
+
+
+
+ for(i = 0; i < leftovers; ++i) {
+
+ candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
+ }
+
+ target[0] = candidate;
+
+
+
+
+
+}
+
+#endif /*LV_HAVE_SSSE3*/
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_16s_max_star_16s_a16_generic(short* target, short* src0, unsigned int num_bytes) {
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ short candidate = src0[0];
+ for(i = 1; i < bound; ++i) {
+ candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
+ }
+ target[0] = candidate;
+
+}
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16s_max_star_16s_a16_H*/
diff --git a/volk/include/volk/volk_16s_max_star_aligned16.h b/volk/include/volk/volk_16s_max_star_aligned16.h
deleted file mode 100644
index ba4e979ec..000000000
--- a/volk/include/volk/volk_16s_max_star_aligned16.h
+++ /dev/null
@@ -1,108 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H
-#define INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H
-
-
-#include
-#include
-
-
-#if LV_HAVE_SSSE3
-
-#include
-#include
-#include
-
-static inline void volk_16s_max_star_aligned16_ssse3(short* target, short* src0, unsigned int num_bytes) {
-
-
-
- short candidate = src0[0];
- short cands[8];
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
-
-
- __m128i *p_src0;
-
- p_src0 = (__m128i*)src0;
-
- int bound = num_bytes >> 4;
- int leftovers = (num_bytes >> 1) & 7;
-
- int i = 0;
-
-
- xmm1 = _mm_setzero_si128();
- xmm0 = _mm_setzero_si128();
- //_mm_insert_epi16(xmm0, candidate, 0);
-
- xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
-
-
- for(i = 0; i < bound; ++i) {
- xmm1 = _mm_load_si128(p_src0);
- p_src0 += 1;
- xmm2 = _mm_sub_epi16(xmm1, xmm0);
-
-
-
-
-
-
- xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
- xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
- xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
-
- xmm6 = _mm_xor_si128(xmm4, xmm5);
-
- xmm3 = _mm_and_si128(xmm3, xmm0);
- xmm4 = _mm_and_si128(xmm6, xmm1);
-
- xmm0 = _mm_add_epi16(xmm3, xmm4);
-
-
- }
-
- _mm_store_si128((__m128i*)cands, xmm0);
-
- for(i = 0; i < 8; ++i) {
- candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
- }
-
-
-
- for(i = 0; i < leftovers; ++i) {
-
- candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
- }
-
- target[0] = candidate;
-
-
-
-
-
-}
-
-#endif /*LV_HAVE_SSSE3*/
-
-#if LV_HAVE_GENERIC
-
-static inline void volk_16s_max_star_aligned16_generic(short* target, short* src0, unsigned int num_bytes) {
-
- int i = 0;
-
- int bound = num_bytes >> 1;
-
- short candidate = src0[0];
- for(i = 1; i < bound; ++i) {
- candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
- }
- target[0] = candidate;
-
-}
-
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h b/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h
new file mode 100644
index 000000000..68994593b
--- /dev/null
+++ b/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h
@@ -0,0 +1,130 @@
+#ifndef INCLUDED_volk_16s_max_star_horizontal_16s_a16_H
+#define INCLUDED_volk_16s_max_star_horizontal_16s_a16_H
+
+
+#include
+#include
+
+
+#if LV_HAVE_SSSE3
+
+#include
+#include
+#include
+
+static inline void volk_16s_max_star_horizontal_16s_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+
+ const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
+ const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+ const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
+
+
+
+ volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ __m128i xmm5, xmm6, xmm7, xmm8;
+
+ xmm4 = _mm_load_si128((__m128i*)shufmask0);
+ xmm5 = _mm_load_si128((__m128i*)shufmask1);
+ xmm6 = _mm_load_si128((__m128i*)andmask0);
+ xmm7 = _mm_load_si128((__m128i*)andmask1);
+
+ __m128i *p_target, *p_src0;
+
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+
+ int bound = num_bytes >> 5;
+ int intermediate = (num_bytes >> 4) & 1;
+ int leftovers = (num_bytes >> 1) & 7;
+
+ int i = 0;
+
+
+ for(i = 0; i < bound; ++i) {
+
+ xmm0 = _mm_load_si128(p_src0);
+ xmm1 = _mm_load_si128(&p_src0[1]);
+
+
+
+ xmm2 = _mm_xor_si128(xmm2, xmm2);
+ p_src0 += 2;
+
+ xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+
+ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+ xmm8 = _mm_and_si128(xmm2, xmm6);
+ xmm3 = _mm_and_si128(xmm2, xmm7);
+
+
+ xmm8 = _mm_add_epi8(xmm8, xmm4);
+ xmm3 = _mm_add_epi8(xmm3, xmm5);
+
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
+ xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
+
+
+ xmm3 = _mm_add_epi16(xmm0, xmm1);
+
+
+ _mm_store_si128(p_target, xmm3);
+
+ p_target += 1;
+
+ }
+
+ for(i = 0; i < intermediate; ++i) {
+
+ xmm0 = _mm_load_si128(p_src0);
+
+
+ xmm2 = _mm_xor_si128(xmm2, xmm2);
+ p_src0 += 1;
+
+ xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+ xmm8 = _mm_and_si128(xmm2, xmm6);
+
+ xmm3 = _mm_add_epi8(xmm8, xmm4);
+
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
+
+
+ _mm_storel_pd((double*)p_target, (__m128d)xmm0);
+
+ p_target = (__m128i*)((int8_t*)p_target + 8);
+
+ }
+
+ for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
+ target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+ }
+
+
+}
+
+#endif /*LV_HAVE_SSSE3*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_max_star_horizontal_16s_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+
+ for(i = 0; i < bound; i += 2) {
+ target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
+ }
+
+}
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16s_max_star_horizontal_16s_a16_H*/
diff --git a/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h b/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h
deleted file mode 100644
index 82d011677..000000000
--- a/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h
+++ /dev/null
@@ -1,130 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H
-#define INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H
-
-
-#include
-#include
-
-
-#if LV_HAVE_SSSE3
-
-#include
-#include
-#include
-
-static inline void volk_16s_max_star_horizontal_aligned16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {
-
- const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
- const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
- const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
- const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
-
-
-
- volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
- __m128i xmm5, xmm6, xmm7, xmm8;
-
- xmm4 = _mm_load_si128((__m128i*)shufmask0);
- xmm5 = _mm_load_si128((__m128i*)shufmask1);
- xmm6 = _mm_load_si128((__m128i*)andmask0);
- xmm7 = _mm_load_si128((__m128i*)andmask1);
-
- __m128i *p_target, *p_src0;
-
- p_target = (__m128i*)target;
- p_src0 = (__m128i*)src0;
-
- int bound = num_bytes >> 5;
- int intermediate = (num_bytes >> 4) & 1;
- int leftovers = (num_bytes >> 1) & 7;
-
- int i = 0;
-
-
- for(i = 0; i < bound; ++i) {
-
- xmm0 = _mm_load_si128(p_src0);
- xmm1 = _mm_load_si128(&p_src0[1]);
-
-
-
- xmm2 = _mm_xor_si128(xmm2, xmm2);
- p_src0 += 2;
-
- xmm3 = _mm_hsub_epi16(xmm0, xmm1);
-
- xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
-
- xmm8 = _mm_and_si128(xmm2, xmm6);
- xmm3 = _mm_and_si128(xmm2, xmm7);
-
-
- xmm8 = _mm_add_epi8(xmm8, xmm4);
- xmm3 = _mm_add_epi8(xmm3, xmm5);
-
- xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
- xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
-
-
- xmm3 = _mm_add_epi16(xmm0, xmm1);
-
-
- _mm_store_si128(p_target, xmm3);
-
- p_target += 1;
-
- }
-
- for(i = 0; i < intermediate; ++i) {
-
- xmm0 = _mm_load_si128(p_src0);
-
-
- xmm2 = _mm_xor_si128(xmm2, xmm2);
- p_src0 += 1;
-
- xmm3 = _mm_hsub_epi16(xmm0, xmm1);
- xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
-
- xmm8 = _mm_and_si128(xmm2, xmm6);
-
- xmm3 = _mm_add_epi8(xmm8, xmm4);
-
- xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
-
-
- _mm_storel_pd((double*)p_target, (__m128d)xmm0);
-
- p_target = (__m128i*)((int8_t*)p_target + 8);
-
- }
-
- for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
- target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
- }
-
-
-}
-
-#endif /*LV_HAVE_SSSE3*/
-
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_max_star_horizontal_aligned16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
-
- int i = 0;
-
- int bound = num_bytes >> 1;
-
-
- for(i = 0; i < bound; i += 2) {
- target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
- }
-
-}
-
-
-
-#endif /*LV_HAVE_GENERIC*/
-
-#endif /*INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h
new file mode 100644
index 000000000..2e7586b57
--- /dev/null
+++ b/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h
@@ -0,0 +1,139 @@
+#ifndef INCLUDED_volk_16s_permute_and_scalar_add_a16_H
+#define INCLUDED_volk_16s_permute_and_scalar_add_a16_H
+
+
+#include
+#include
+
+
+
+
+#if LV_HAVE_SSE2
+
+#include
+#include
+
+static inline void volk_16s_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+ __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
+
+ short* p_permute_indexes = permute_indexes;
+
+ p_target = (__m128i*)target;
+ p_cntl0 = (__m128i*)cntl0;
+ p_cntl1 = (__m128i*)cntl1;
+ p_cntl2 = (__m128i*)cntl2;
+ p_cntl3 = (__m128i*)cntl3;
+ p_scalars = (__m128i*)scalars;
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int leftovers = (num_bytes >> 1) & 7;
+
+ xmm0 = _mm_load_si128(p_scalars);
+
+ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+
+ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+
+ for(; i < bound; ++i) {
+ xmm0 = _mm_setzero_si128();
+ xmm5 = _mm_setzero_si128();
+ xmm6 = _mm_setzero_si128();
+ xmm7 = _mm_setzero_si128();
+
+ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
+ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
+ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
+ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
+ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
+ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
+ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
+ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+ xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+ p_permute_indexes += 8;
+
+ xmm0 = _mm_add_epi16(xmm0, xmm6);
+
+ xmm5 = _mm_load_si128(p_cntl0);
+ xmm6 = _mm_load_si128(p_cntl1);
+ xmm7 = _mm_load_si128(p_cntl2);
+
+ xmm5 = _mm_and_si128(xmm5, xmm1);
+ xmm6 = _mm_and_si128(xmm6, xmm2);
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+
+ xmm5 = _mm_load_si128(p_cntl3);
+
+ xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+ p_cntl0 += 1;
+
+ xmm5 = _mm_and_si128(xmm5, xmm4);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm6);
+
+ p_cntl1 += 1;
+ p_cntl2 += 1;
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+
+ p_cntl3 += 1;
+
+ _mm_store_si128(p_target, xmm0);
+
+ p_target += 1;
+ }
+
+
+
+
+
+ for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ target[i] = src0[permute_indexes[i]]
+ + (cntl0[i] & scalars[0])
+ + (cntl1[i] & scalars[1])
+ + (cntl2[i] & scalars[2])
+ + (cntl3[i] & scalars[3]);
+ }
+}
+#endif /*LV_HAVE_SSEs*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ for(i = 0; i < bound; ++i) {
+ target[i] = src0[permute_indexes[i]]
+ + (cntl0[i] & scalars[0])
+ + (cntl1[i] & scalars[1])
+ + (cntl2[i] & scalars[2])
+ + (cntl3[i] & scalars[3]);
+
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16s_permute_and_scalar_add_a16_H*/
diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h
deleted file mode 100644
index 452d05c4f..000000000
--- a/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h
+++ /dev/null
@@ -1,139 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
-#define INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
-
-
-#include
-#include
-
-
-
-
-#if LV_HAVE_SSE2
-
-#include
-#include
-
-static inline void volk_16s_permute_and_scalar_add_aligned16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
-
-
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
-
- short* p_permute_indexes = permute_indexes;
-
- p_target = (__m128i*)target;
- p_cntl0 = (__m128i*)cntl0;
- p_cntl1 = (__m128i*)cntl1;
- p_cntl2 = (__m128i*)cntl2;
- p_cntl3 = (__m128i*)cntl3;
- p_scalars = (__m128i*)scalars;
-
- int i = 0;
-
- int bound = (num_bytes >> 4);
- int leftovers = (num_bytes >> 1) & 7;
-
- xmm0 = _mm_load_si128(p_scalars);
-
- xmm1 = _mm_shufflelo_epi16(xmm0, 0);
- xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
- xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
- xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
-
- xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
- xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
- xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
- xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
-
-
- for(; i < bound; ++i) {
- xmm0 = _mm_setzero_si128();
- xmm5 = _mm_setzero_si128();
- xmm6 = _mm_setzero_si128();
- xmm7 = _mm_setzero_si128();
-
- xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
- xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
- xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
- xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
- xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
- xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
- xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
- xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
-
- xmm0 = _mm_add_epi16(xmm0, xmm5);
- xmm6 = _mm_add_epi16(xmm6, xmm7);
-
- p_permute_indexes += 8;
-
- xmm0 = _mm_add_epi16(xmm0, xmm6);
-
- xmm5 = _mm_load_si128(p_cntl0);
- xmm6 = _mm_load_si128(p_cntl1);
- xmm7 = _mm_load_si128(p_cntl2);
-
- xmm5 = _mm_and_si128(xmm5, xmm1);
- xmm6 = _mm_and_si128(xmm6, xmm2);
- xmm7 = _mm_and_si128(xmm7, xmm3);
-
- xmm0 = _mm_add_epi16(xmm0, xmm5);
-
- xmm5 = _mm_load_si128(p_cntl3);
-
- xmm6 = _mm_add_epi16(xmm6, xmm7);
-
- p_cntl0 += 1;
-
- xmm5 = _mm_and_si128(xmm5, xmm4);
-
- xmm0 = _mm_add_epi16(xmm0, xmm6);
-
- p_cntl1 += 1;
- p_cntl2 += 1;
-
- xmm0 = _mm_add_epi16(xmm0, xmm5);
-
- p_cntl3 += 1;
-
- _mm_store_si128(p_target, xmm0);
-
- p_target += 1;
- }
-
-
-
-
-
- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
- target[i] = src0[permute_indexes[i]]
- + (cntl0[i] & scalars[0])
- + (cntl1[i] & scalars[1])
- + (cntl2[i] & scalars[2])
- + (cntl3[i] & scalars[3]);
- }
-}
-#endif /*LV_HAVE_SSEs*/
-
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_permute_and_scalar_add_aligned16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
-
- int i = 0;
-
- int bound = num_bytes >> 1;
-
- for(i = 0; i < bound; ++i) {
- target[i] = src0[permute_indexes[i]]
- + (cntl0[i] & scalars[0])
- + (cntl1[i] & scalars[1])
- + (cntl2[i] & scalars[2])
- + (cntl3[i] & scalars[3]);
-
- }
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_quad_max_star_16s_a16.h b/volk/include/volk/volk_16s_quad_max_star_16s_a16.h
new file mode 100644
index 000000000..3e89ff963
--- /dev/null
+++ b/volk/include/volk/volk_16s_quad_max_star_16s_a16.h
@@ -0,0 +1,191 @@
+#ifndef INCLUDED_volk_16s_quad_max_star_16s_a16_H
+#define INCLUDED_volk_16s_quad_max_star_16s_a16_H
+
+
+#include
+#include
+
+
+
+
+
+#if LV_HAVE_SSE2
+
+#include
+
+static inline void volk_16s_quad_max_star_16s_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+
+
+
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int bound_copy = bound;
+ int leftovers = (num_bytes >> 1) & 7;
+
+ __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
+ p_target = (__m128i*) target;
+ p_src0 = (__m128i*)src0;
+ p_src1 = (__m128i*)src1;
+ p_src2 = (__m128i*)src2;
+ p_src3 = (__m128i*)src3;
+
+
+
+ __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+ while(bound_copy > 0) {
+
+ xmm1 = _mm_load_si128(p_src0);
+ xmm2 = _mm_load_si128(p_src1);
+ xmm3 = _mm_load_si128(p_src2);
+ xmm4 = _mm_load_si128(p_src3);
+
+ xmm5 = _mm_setzero_si128();
+ xmm6 = _mm_setzero_si128();
+ xmm7 = xmm1;
+ xmm8 = xmm3;
+
+
+ xmm1 = _mm_sub_epi16(xmm2, xmm1);
+
+
+
+ xmm3 = _mm_sub_epi16(xmm4, xmm3);
+
+ xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
+ xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
+
+
+
+ xmm2 = _mm_and_si128(xmm5, xmm2);
+ xmm4 = _mm_and_si128(xmm6, xmm4);
+ xmm5 = _mm_andnot_si128(xmm5, xmm7);
+ xmm6 = _mm_andnot_si128(xmm6, xmm8);
+
+ xmm5 = _mm_add_epi16(xmm2, xmm5);
+ xmm6 = _mm_add_epi16(xmm4, xmm6);
+
+
+ xmm1 = _mm_xor_si128(xmm1, xmm1);
+ xmm2 = xmm5;
+ xmm5 = _mm_sub_epi16(xmm6, xmm5);
+ p_src0 += 1;
+ bound_copy -= 1;
+
+ xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
+ p_src1 += 1;
+
+ xmm6 = _mm_and_si128(xmm1, xmm6);
+
+ xmm1 = _mm_andnot_si128(xmm1, xmm2);
+ p_src2 += 1;
+
+
+
+ xmm1 = _mm_add_epi16(xmm6, xmm1);
+ p_src3 += 1;
+
+
+ _mm_store_si128(p_target, xmm1);
+ p_target += 1;
+
+ }
+
+
+ /*asm volatile
+ (
+ "volk_16s_quad_max_star_16s_a16_sse2_L1:\n\t"
+ "cmp $0, %[bound]\n\t"
+ "je volk_16s_quad_max_star_16s_a16_sse2_END\n\t"
+
+ "movaps (%[src0]), %%xmm1\n\t"
+ "movaps (%[src1]), %%xmm2\n\t"
+ "movaps (%[src2]), %%xmm3\n\t"
+ "movaps (%[src3]), %%xmm4\n\t"
+
+ "pxor %%xmm5, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm6\n\t"
+ "movaps %%xmm1, %%xmm7\n\t"
+ "movaps %%xmm3, %%xmm8\n\t"
+ "psubw %%xmm2, %%xmm1\n\t"
+ "psubw %%xmm4, %%xmm3\n\t"
+
+ "pcmpgtw %%xmm1, %%xmm5\n\t"
+ "pcmpgtw %%xmm3, %%xmm6\n\t"
+
+ "pand %%xmm5, %%xmm2\n\t"
+ "pand %%xmm6, %%xmm4\n\t"
+ "pandn %%xmm7, %%xmm5\n\t"
+ "pandn %%xmm8, %%xmm6\n\t"
+
+ "paddw %%xmm2, %%xmm5\n\t"
+ "paddw %%xmm4, %%xmm6\n\t"
+
+ "pxor %%xmm1, %%xmm1\n\t"
+ "movaps %%xmm5, %%xmm2\n\t"
+
+ "psubw %%xmm6, %%xmm5\n\t"
+ "add $16, %[src0]\n\t"
+ "add $-1, %[bound]\n\t"
+
+ "pcmpgtw %%xmm5, %%xmm1\n\t"
+ "add $16, %[src1]\n\t"
+
+ "pand %%xmm1, %%xmm6\n\t"
+
+ "pandn %%xmm2, %%xmm1\n\t"
+ "add $16, %[src2]\n\t"
+
+ "paddw %%xmm6, %%xmm1\n\t"
+ "add $16, %[src3]\n\t"
+
+ "movaps %%xmm1, (%[target])\n\t"
+ "addw $16, %[target]\n\t"
+ "jmp volk_16s_quad_max_star_16s_a16_sse2_L1\n\t"
+
+ "volk_16s_quad_max_star_16s_a16_sse2_END:\n\t"
+ :
+ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
+ :
+ );
+ */
+
+ short temp0 = 0;
+ short temp1 = 0;
+ for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+ temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+ target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+ }
+ return;
+
+
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_quad_max_star_16s_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ short temp0 = 0;
+ short temp1 = 0;
+ for(i = 0; i < bound; ++i) {
+ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+ temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+ target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+ }
+}
+
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16s_quad_max_star_16s_a16_H*/
diff --git a/volk/include/volk/volk_16s_quad_max_star_aligned16.h b/volk/include/volk/volk_16s_quad_max_star_aligned16.h
deleted file mode 100644
index 1004c4d23..000000000
--- a/volk/include/volk/volk_16s_quad_max_star_aligned16.h
+++ /dev/null
@@ -1,191 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H
-#define INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H
-
-
-#include
-#include
-
-
-
-
-
-#if LV_HAVE_SSE2
-
-#include
-
-static inline void volk_16s_quad_max_star_aligned16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
-
-
-
-
- int i = 0;
-
- int bound = (num_bytes >> 4);
- int bound_copy = bound;
- int leftovers = (num_bytes >> 1) & 7;
-
- __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
- p_target = (__m128i*) target;
- p_src0 = (__m128i*)src0;
- p_src1 = (__m128i*)src1;
- p_src2 = (__m128i*)src2;
- p_src3 = (__m128i*)src3;
-
-
-
- __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-
- while(bound_copy > 0) {
-
- xmm1 = _mm_load_si128(p_src0);
- xmm2 = _mm_load_si128(p_src1);
- xmm3 = _mm_load_si128(p_src2);
- xmm4 = _mm_load_si128(p_src3);
-
- xmm5 = _mm_setzero_si128();
- xmm6 = _mm_setzero_si128();
- xmm7 = xmm1;
- xmm8 = xmm3;
-
-
- xmm1 = _mm_sub_epi16(xmm2, xmm1);
-
-
-
- xmm3 = _mm_sub_epi16(xmm4, xmm3);
-
- xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
- xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
-
-
-
- xmm2 = _mm_and_si128(xmm5, xmm2);
- xmm4 = _mm_and_si128(xmm6, xmm4);
- xmm5 = _mm_andnot_si128(xmm5, xmm7);
- xmm6 = _mm_andnot_si128(xmm6, xmm8);
-
- xmm5 = _mm_add_epi16(xmm2, xmm5);
- xmm6 = _mm_add_epi16(xmm4, xmm6);
-
-
- xmm1 = _mm_xor_si128(xmm1, xmm1);
- xmm2 = xmm5;
- xmm5 = _mm_sub_epi16(xmm6, xmm5);
- p_src0 += 1;
- bound_copy -= 1;
-
- xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
- p_src1 += 1;
-
- xmm6 = _mm_and_si128(xmm1, xmm6);
-
- xmm1 = _mm_andnot_si128(xmm1, xmm2);
- p_src2 += 1;
-
-
-
- xmm1 = _mm_add_epi16(xmm6, xmm1);
- p_src3 += 1;
-
-
- _mm_store_si128(p_target, xmm1);
- p_target += 1;
-
- }
-
-
- /*asm volatile
- (
- "volk_16s_quad_max_star_aligned16_sse2_L1:\n\t"
- "cmp $0, %[bound]\n\t"
- "je volk_16s_quad_max_star_aligned16_sse2_END\n\t"
-
- "movaps (%[src0]), %%xmm1\n\t"
- "movaps (%[src1]), %%xmm2\n\t"
- "movaps (%[src2]), %%xmm3\n\t"
- "movaps (%[src3]), %%xmm4\n\t"
-
- "pxor %%xmm5, %%xmm5\n\t"
- "pxor %%xmm6, %%xmm6\n\t"
- "movaps %%xmm1, %%xmm7\n\t"
- "movaps %%xmm3, %%xmm8\n\t"
- "psubw %%xmm2, %%xmm1\n\t"
- "psubw %%xmm4, %%xmm3\n\t"
-
- "pcmpgtw %%xmm1, %%xmm5\n\t"
- "pcmpgtw %%xmm3, %%xmm6\n\t"
-
- "pand %%xmm5, %%xmm2\n\t"
- "pand %%xmm6, %%xmm4\n\t"
- "pandn %%xmm7, %%xmm5\n\t"
- "pandn %%xmm8, %%xmm6\n\t"
-
- "paddw %%xmm2, %%xmm5\n\t"
- "paddw %%xmm4, %%xmm6\n\t"
-
- "pxor %%xmm1, %%xmm1\n\t"
- "movaps %%xmm5, %%xmm2\n\t"
-
- "psubw %%xmm6, %%xmm5\n\t"
- "add $16, %[src0]\n\t"
- "add $-1, %[bound]\n\t"
-
- "pcmpgtw %%xmm5, %%xmm1\n\t"
- "add $16, %[src1]\n\t"
-
- "pand %%xmm1, %%xmm6\n\t"
-
- "pandn %%xmm2, %%xmm1\n\t"
- "add $16, %[src2]\n\t"
-
- "paddw %%xmm6, %%xmm1\n\t"
- "add $16, %[src3]\n\t"
-
- "movaps %%xmm1, (%[target])\n\t"
- "addw $16, %[target]\n\t"
- "jmp volk_16s_quad_max_star_aligned16_sse2_L1\n\t"
-
- "volk_16s_quad_max_star_aligned16_sse2_END:\n\t"
- :
- :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
- :
- );
- */
-
- short temp0 = 0;
- short temp1 = 0;
- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
- temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
- temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
- target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
- }
- return;
-
-
-}
-
-#endif /*LV_HAVE_SSE2*/
-
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_quad_max_star_aligned16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
-
- int i = 0;
-
- int bound = num_bytes >> 1;
-
- short temp0 = 0;
- short temp1 = 0;
- for(i = 0; i < bound; ++i) {
- temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
- temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
- target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
- }
-}
-
-
-
-
-#endif /*LV_HAVE_GENERIC*/
-
-#endif /*INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_a16.h b/volk/include/volk/volk_16s_s32f_convert_32f_a16.h
new file mode 100644
index 000000000..8f9b44478
--- /dev/null
+++ b/volk/include/volk/volk_16s_s32f_convert_32f_a16.h
@@ -0,0 +1,119 @@
+#ifndef INCLUDED_volk_16s_s32f_convert_32f_a16_H
+#define INCLUDED_volk_16s_s32f_convert_32f_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE4_1
+#include
+
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_16s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
+
+ for(;number < eighthPoints; number++){
+
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include
+
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_16s_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_16s_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16s_s32f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h
new file mode 100644
index 000000000..ad52aea1a
--- /dev/null
+++ b/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_16s_s32f_convert_32f_ua16_H
+#define INCLUDED_volk_16s_s32f_convert_32f_ua16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE4_1
+#include
+
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_16s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
+
+ for(;number < eighthPoints; number++){
+
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include
+
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_16s_s32f_convert_32f_ua16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_16s_s32f_convert_32f_ua16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16s_s32f_convert_32f_ua16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h
new file mode 100644
index 000000000..8e5da24ec
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H
+#define INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSSE3
+#include
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_16s_16s_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+
+ __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+ __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
+ qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *int16ComplexVectorPtr++;
+ *qBufferPtr++ = *int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_SSE2
+#include
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_16s_16s_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
+ iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
+
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ unsigned int number;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+extern void volk_16sc_deinterleave_16s_16s_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16sc_deinterleave_16s_16s_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ volk_16sc_deinterleave_16s_16s_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
deleted file mode 100644
index cf94a3f38..000000000
--- a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
+++ /dev/null
@@ -1,158 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSSE3
-#include
-/*!
- \brief Deinterleaves the complex 16 bit vector into I & Q vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param qBuffer The Q buffer output data
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_16s_aligned16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
-
- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
- __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
- __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
- __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
-
- unsigned int eighthPoints = num_points / 8;
-
- for(number = 0; number < eighthPoints; number++){
- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-
- iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
- qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
-
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
- for(; number < num_points; number++){
- *iBufferPtr++ = *int16ComplexVectorPtr++;
- *qBufferPtr++ = *int16ComplexVectorPtr++;
- }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-#if LV_HAVE_SSE2
-#include
-/*!
- \brief Deinterleaves the complex 16 bit vector into I & Q vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param qBuffer The Q buffer output data
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_16s_aligned16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (int16_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
- __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
- __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
- __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
-
- unsigned int eighthPoints = num_points / 8;
-
- for(number = 0; number < eighthPoints; number++){
- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
-
- iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
-
- iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
-
- iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
-
- iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
-
- iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
-
- iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
-
- iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
-
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
- qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
-
- qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
-
- qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
-
- qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
-
- qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
-
- qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
-
- qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
-
- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
- \brief Deinterleaves the complex 16 bit vector into I & Q vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param qBuffer The Q buffer output data
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
- unsigned int number;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
- \brief Deinterleaves the complex 16 bit vector into I & Q vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param qBuffer The Q buffer output data
- \param num_points The number of complex data values to be deinterleaved
-*/
-extern void volk_16sc_deinterleave_16s_aligned16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
-static inline void volk_16sc_deinterleave_16s_aligned16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
- volk_16sc_deinterleave_16s_aligned16_orc_impl(iBuffer, qBuffer, complexVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
deleted file mode 100644
index 50b8b62d5..000000000
--- a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
+++ /dev/null
@@ -1,108 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE
-#include
- /*!
- \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param qBuffer The Q buffer output data
- \param scalar The data value to be divided against each input data value of the input complex vector
- \param num_points The number of complex data values to be deinterleaved
- */
-static inline void volk_16sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
-
- uint64_t number = 0;
- const uint64_t quarterPoints = num_points / 4;
- __m128 cplxValue1, cplxValue2, iValue, qValue;
-
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* complexVectorPtr = (int16_t*)complexVector;
-
- float floatBuffer[8] __attribute__((aligned(128)));
-
- for(;number < quarterPoints; number++){
-
- floatBuffer[0] = (float)(complexVectorPtr[0]);
- floatBuffer[1] = (float)(complexVectorPtr[1]);
- floatBuffer[2] = (float)(complexVectorPtr[2]);
- floatBuffer[3] = (float)(complexVectorPtr[3]);
-
- floatBuffer[4] = (float)(complexVectorPtr[4]);
- floatBuffer[5] = (float)(complexVectorPtr[5]);
- floatBuffer[6] = (float)(complexVectorPtr[6]);
- floatBuffer[7] = (float)(complexVectorPtr[7]);
-
- cplxValue1 = _mm_load_ps(&floatBuffer[0]);
- cplxValue2 = _mm_load_ps(&floatBuffer[4]);
-
- complexVectorPtr += 8;
-
- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- // Arrange in q1q2q3q4 format
- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
- _mm_store_ps(iBufferPtr, iValue);
- _mm_store_ps(qBufferPtr, qValue);
-
- iBufferPtr += 4;
- qBufferPtr += 4;
- }
-
- number = quarterPoints * 4;
- complexVectorPtr = (int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
- /*!
- \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param qBuffer The Q buffer output data
- \param scalar The data value to be divided against each input data value of the input complex vector
- \param num_points The number of complex data values to be deinterleaved
- */
-static inline void volk_16sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
- unsigned int number;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
- /*!
- \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param qBuffer The Q buffer output data
- \param scalar The data value to be divided against each input data value of the input complex vector
- \param num_points The number of complex data values to be deinterleaved
- */
-extern void volk_16sc_deinterleave_32f_aligned16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_16sc_deinterleave_32f_aligned16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- volk_16sc_deinterleave_32f_aligned16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h
new file mode 100644
index 000000000..068c1350c
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_16sc_deinterleave_real_16s_a16_H
+#define INCLUDED_volk_16sc_deinterleave_real_16s_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSSE3
+#include
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_16s_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i complexVal1, complexVal2, iOutputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
+ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+ iOutputVal = _mm_or_si128(complexVal1, complexVal2);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#if LV_HAVE_SSE2
+#include
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_16s_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i complexVal1, complexVal2, iOutputVal;
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
+ complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16sc_deinterleave_real_16s_a16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h
deleted file mode 100644
index b594c85b8..000000000
--- a/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSSE3
-#include
-/*!
- \brief Deinterleaves the complex 16 bit vector into I vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_16s_aligned16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (int16_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
-
- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
- __m128i complexVal1, complexVal2, iOutputVal;
-
- unsigned int eighthPoints = num_points / 8;
-
- for(number = 0; number < eighthPoints; number++){
- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
-
- complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
- complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
-
- iOutputVal = _mm_or_si128(complexVal1, complexVal2);
-
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
- iBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-
-#if LV_HAVE_SSE2
-#include
-/*!
- \brief Deinterleaves the complex 16 bit vector into I vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_16s_aligned16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (int16_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- __m128i complexVal1, complexVal2, iOutputVal;
- __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
- __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
-
- unsigned int eighthPoints = num_points / 8;
-
- for(number = 0; number < eighthPoints; number++){
- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
-
- complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
-
- complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
-
- complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
-
- complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
-
- complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
-
- complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
-
- iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
-
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
- iBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
- \brief Deinterleaves the complex 16 bit vector into I vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (int16_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h
deleted file mode 100644
index 3e7be1e64..000000000
--- a/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h
+++ /dev/null
@@ -1,125 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE4_1
-#include
-/*!
- \brief Deinterleaves the complex 16 bit vector into I float vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param scalar The scaling value being multiplied against each data point
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- float* iBufferPtr = iBuffer;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128 iFloatValue;
-
- const float iScalar= 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
- __m128i complexVal, iIntVal;
- int8_t* complexVectorPtr = (int8_t*)complexVector;
-
- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-
- for(;number < quarterPoints; number++){
- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
- complexVal = _mm_shuffle_epi8(complexVal, moveMask);
-
- iIntVal = _mm_cvtepi16_epi32(complexVal);
- iFloatValue = _mm_cvtepi32_ps(iIntVal);
-
- iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-
- _mm_store_ps(iBufferPtr, iFloatValue);
-
- iBufferPtr += 4;
- }
-
- number = quarterPoints * 4;
- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
- sixteenTComplexVectorPtr++;
- }
-
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include
-/*!
- \brief Deinterleaves the complex 16 bit vector into I float vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param scalar The scaling value being multiplied against each data point
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- float* iBufferPtr = iBuffer;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
- __m128 iValue;
-
- const float iScalar = 1.0/scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
- int16_t* complexVectorPtr = (int16_t*)complexVector;
-
- float floatBuffer[4] __attribute__((aligned(128)));
-
- for(;number < quarterPoints; number++){
- floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
- floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
- floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
- floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-
- iValue = _mm_load_ps(floatBuffer);
-
- iValue = _mm_mul_ps(iValue, invScalar);
-
- _mm_store_ps(iBufferPtr, iValue);
-
- iBufferPtr += 4;
- }
-
- number = quarterPoints * 4;
- complexVectorPtr = (int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
- complexVectorPtr++;
- }
-
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
- \brief Deinterleaves the complex 16 bit vector into I float vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param scalar The scaling value being multiplied against each data point
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* iBufferPtr = iBuffer;
- const float invScalar = 1.0 / scalar;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
- complexVectorPtr++;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h
new file mode 100644
index 000000000..afa21ebc4
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h
@@ -0,0 +1,94 @@
+#ifndef INCLUDED_volk_16sc_deinterleave_real_8s_a16_H
+#define INCLUDED_volk_16sc_deinterleave_real_8s_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSSE3
+#include
+/*!
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for(number = 0; number < sixteenthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+ complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+
+ complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+ complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+
+ complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+
+
+ complexVal1 = _mm_srai_epi16(complexVal1, 8);
+ complexVal3 = _mm_srai_epi16(complexVal3, 8);
+
+ iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256));
+ int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256);
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+extern void volk_16sc_deinterleave_real_8s_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16sc_deinterleave_real_8s_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ volk_16sc_deinterleave_real_8s_a16_orc_impl(iBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16sc_deinterleave_real_8s_a16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
deleted file mode 100644
index 2dd85a422..000000000
--- a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSSE3
-#include
-/*!
- \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
- __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
-
- unsigned int sixteenthPoints = num_points / 16;
-
- for(number = 0; number < sixteenthPoints; number++){
- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-
- complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
- complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-
- complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
- complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
-
- complexVal1 = _mm_or_si128(complexVal1, complexVal2);
-
- complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
- complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
-
- complexVal3 = _mm_or_si128(complexVal3, complexVal4);
-
-
- complexVal1 = _mm_srai_epi16(complexVal1, 8);
- complexVal3 = _mm_srai_epi16(complexVal3, 8);
-
- iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
-
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
- iBufferPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
- for(; number < num_points; number++){
- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256));
- int16ComplexVectorPtr++;
- }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-#if LV_HAVE_GENERIC
-/*!
- \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (int16_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256);
- complexVectorPtr++;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
- \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param num_points The number of complex data values to be deinterleaved
-*/
-extern void volk_16sc_deinterleave_real_8s_aligned16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
-static inline void volk_16sc_deinterleave_real_8s_aligned16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
- volk_16sc_deinterleave_real_8s_aligned16_orc_impl(iBuffer, complexVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_16s_a16.h b/volk/include/volk/volk_16sc_magnitude_16s_a16.h
new file mode 100644
index 000000000..d832de5fe
--- /dev/null
+++ b/volk/include/volk/volk_16sc_magnitude_16s_a16.h
@@ -0,0 +1,190 @@
+#ifndef INCLUDED_volk_16sc_magnitude_16s_a16_H
+#define INCLUDED_volk_16sc_magnitude_16s_a16_H
+
+#include
+#include
+#include
+
+#if LV_HAVE_SSE3
+#include
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 vScalar = _mm_set_ps1(32768.0);
+ __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ float inputFloatBuffer[8] __attribute__((aligned(128)));
+ float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ result = _mm_mul_ps(result, vScalar); // Scale the results
+
+ _mm_store_ps(outputFloatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+ *magnitudeVectorPtr++ = (int16_t)(val1Result);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 vScalar = _mm_set_ps1(32768.0);
+ __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+ float inputFloatBuffer[4] __attribute__((aligned(128)));
+ float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ cplxValue1 = _mm_load_ps(inputFloatBuffer);
+ complexVectorPtr += 4;
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ cplxValue2 = _mm_load_ps(inputFloatBuffer);
+ complexVectorPtr += 4;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ result = _mm_mul_ps(result, vScalar); // Scale the results
+
+ _mm_store_ps(outputFloatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+ *magnitudeVectorPtr++ = (int16_t)(val1Result);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ const float scalar = 32768.0;
+ for(number = 0; number < num_points; number++){
+ float real = ((float)(*complexVectorPtr++)) / scalar;
+ float imag = ((float)(*complexVectorPtr++)) / scalar;
+ *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC_DISABLED
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_16sc_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
+static inline void volk_16sc_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+ volk_16sc_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16sc_magnitude_16s_a16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
deleted file mode 100644
index 41e8751d6..000000000
--- a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
+++ /dev/null
@@ -1,190 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H
-
-#include
-#include
-#include
-
-#if LV_HAVE_SSE3
-#include
-/*!
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
-
- __m128 vScalar = _mm_set_ps1(32768.0);
- __m128 invScalar = _mm_set_ps1(1.0/32768.0);
-
- __m128 cplxValue1, cplxValue2, result;
-
- float inputFloatBuffer[8] __attribute__((aligned(128)));
- float outputFloatBuffer[4] __attribute__((aligned(128)));
-
- for(;number < quarterPoints; number++){
-
- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-
- inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
- inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
- inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
- inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
-
- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-
- complexVectorPtr += 8;
-
- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
- result = _mm_sqrt_ps(result); // Square root the values
-
- result = _mm_mul_ps(result, vScalar); // Scale the results
-
- _mm_store_ps(outputFloatBuffer, result);
- *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
- *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
- *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
- *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- magnitudeVectorPtr = &magnitudeVector[number];
- complexVectorPtr = (const int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
- const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
- *magnitudeVectorPtr++ = (int16_t)(val1Result);
- }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_SSE
-#include
-/*!
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
-
- __m128 vScalar = _mm_set_ps1(32768.0);
- __m128 invScalar = _mm_set_ps1(1.0/32768.0);
-
- __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-
- float inputFloatBuffer[4] __attribute__((aligned(128)));
- float outputFloatBuffer[4] __attribute__((aligned(128)));
-
- for(;number < quarterPoints; number++){
-
- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-
- cplxValue1 = _mm_load_ps(inputFloatBuffer);
- complexVectorPtr += 4;
-
- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-
- cplxValue2 = _mm_load_ps(inputFloatBuffer);
- complexVectorPtr += 4;
-
- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- // Arrange in q1q2q3q4 format
- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
- iValue = _mm_mul_ps(iValue, iValue); // Square the I values
- qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
- result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
- result = _mm_sqrt_ps(result); // Square root the values
-
- result = _mm_mul_ps(result, vScalar); // Scale the results
-
- _mm_store_ps(outputFloatBuffer, result);
- *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
- *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
- *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
- *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- magnitudeVectorPtr = &magnitudeVector[number];
- complexVectorPtr = (const int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
- const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
- *magnitudeVectorPtr++ = (int16_t)(val1Result);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
- unsigned int number = 0;
- const float scalar = 32768.0;
- for(number = 0; number < num_points; number++){
- float real = ((float)(*complexVectorPtr++)) / scalar;
- float imag = ((float)(*complexVectorPtr++)) / scalar;
- *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC_DISABLED
-/*!
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
-static inline void volk_16sc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
- volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
deleted file mode 100644
index c2605d551..000000000
--- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
+++ /dev/null
@@ -1,179 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H
-
-#include
-#include
-#include
-
-#if LV_HAVE_SSE3
-#include
-/*!
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param scalar The data value to be divided against each input data value of the input complex vector
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
-
- __m128 cplxValue1, cplxValue2, result;
-
- float inputFloatBuffer[8] __attribute__((aligned(128)));
-
- for(;number < quarterPoints; number++){
-
- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-
- inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
- inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
- inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
- inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
-
- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-
- complexVectorPtr += 8;
-
- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
- result = _mm_sqrt_ps(result); // Square root the values
-
- _mm_store_ps(magnitudeVectorPtr, result);
-
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- magnitudeVectorPtr = &magnitudeVector[number];
- complexVectorPtr = (const int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- float val1Real = (float)(*complexVectorPtr++) / scalar;
- float val1Imag = (float)(*complexVectorPtr++) / scalar;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_SSE
-#include
-/*!
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param scalar The data value to be divided against each input data value of the input complex vector
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- const float iScalar = 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
-
- __m128 cplxValue1, cplxValue2, result, re, im;
-
- float inputFloatBuffer[8] __attribute__((aligned(128)));
-
- for(;number < quarterPoints; number++){
- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-
- inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
- inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
- inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
- inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
-
- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-
- re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
- im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
-
- complexVectorPtr += 8;
-
- cplxValue1 = _mm_mul_ps(re, invScalar);
- cplxValue2 = _mm_mul_ps(im, invScalar);
-
- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
- result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
- result = _mm_sqrt_ps(result); // Square root the values
-
- _mm_store_ps(magnitudeVectorPtr, result);
-
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- magnitudeVectorPtr = &magnitudeVector[number];
- complexVectorPtr = (const int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- float val1Real = (float)(*complexVectorPtr++) * iScalar;
- float val1Imag = (float)(*complexVectorPtr++) * iScalar;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
-}
-
-
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param scalar The data value to be divided against each input data value of the input complex vector
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
- unsigned int number = 0;
- const float invScalar = 1.0 / scalar;
- for(number = 0; number < num_points; number++){
- float real = ( (float) (*complexVectorPtr++)) * invScalar;
- float imag = ( (float) (*complexVectorPtr++)) * invScalar;
- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC_DISABLED
-/*!
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param scalar The data value to be divided against each input data value of the input complex vector
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-extern void volk_16sc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_16sc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- volk_16sc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h
new file mode 100644
index 000000000..53e4253c4
--- /dev/null
+++ b/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h
@@ -0,0 +1,108 @@
+#ifndef INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H
+#define INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE
+#include
+ /*!
+ \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex data values to be deinterleaved
+ */
+static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ uint64_t number = 0;
+ const uint64_t quarterPoints = num_points / 4;
+ __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ float floatBuffer[8] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+
+ floatBuffer[0] = (float)(complexVectorPtr[0]);
+ floatBuffer[1] = (float)(complexVectorPtr[1]);
+ floatBuffer[2] = (float)(complexVectorPtr[2]);
+ floatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ floatBuffer[4] = (float)(complexVectorPtr[4]);
+ floatBuffer[5] = (float)(complexVectorPtr[5]);
+ floatBuffer[6] = (float)(complexVectorPtr[6]);
+ floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+ /*!
+ \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex data values to be deinterleaved
+ */
+static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int number;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+ /*!
+ \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex data values to be deinterleaved
+ */
+extern void volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h
new file mode 100644
index 000000000..7320db368
--- /dev/null
+++ b/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h
@@ -0,0 +1,125 @@
+#ifndef INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H
+#define INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE4_1
+#include
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I float vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param scalar The scaling value being multiplied against each data point
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iFloatValue;
+
+ const float iScalar= 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+
+ for(;number < quarterPoints; number++){
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+ iIntVal = _mm_cvtepi16_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+ sixteenTComplexVectorPtr++;
+ }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I float vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param scalar The scaling value being multiplied against each data point
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 iValue;
+
+ const float iScalar = 1.0/scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ float floatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+ floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+
+ iValue = _mm_load_ps(floatBuffer);
+
+ iValue = _mm_mul_ps(iValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
+ complexVectorPtr++;
+ }
+
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I float vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param scalar The scaling value being multiplied against each data point
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ const float invScalar = 1.0 / scalar;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H */
diff --git a/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h
new file mode 100644
index 000000000..649b5cc96
--- /dev/null
+++ b/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h
@@ -0,0 +1,179 @@
+#ifndef INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H
+#define INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H
+
+#include
+#include
+#include
+
+#if LV_HAVE_SSE3
+#include
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ float inputFloatBuffer[8] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ _mm_store_ps(magnitudeVectorPtr, result);
+
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ float val1Real = (float)(*complexVectorPtr++) / scalar;
+ float val1Imag = (float)(*complexVectorPtr++) / scalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+
+ __m128 cplxValue1, cplxValue2, result, re, im;
+
+ float inputFloatBuffer[8] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
+ im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(re, invScalar);
+ cplxValue2 = _mm_mul_ps(im, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ _mm_store_ps(magnitudeVectorPtr, result);
+
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ float val1Real = (float)(*complexVectorPtr++) * iScalar;
+ float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+
+
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ const float invScalar = 1.0 / scalar;
+ for(number = 0; number < num_points; number++){
+ float real = ( (float) (*complexVectorPtr++)) * invScalar;
+ float imag = ( (float) (*complexVectorPtr++)) * invScalar;
+ *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC_DISABLED
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_16sc_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16sc_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ volk_16sc_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H */
diff --git a/volk/include/volk/volk_16u_byteswap_a16.h b/volk/include/volk/volk_16u_byteswap_a16.h
new file mode 100644
index 000000000..c8128dbab
--- /dev/null
+++ b/volk/include/volk/volk_16u_byteswap_a16.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_16u_byteswap_a16_H
+#define INCLUDED_volk_16u_byteswap_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE2
+#include
+
+/*!
+ \brief Byteswaps (in-place) an aligned vector of int16_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int number = 0;
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8;
+ for(;number < eighthPoints; number++){
+ // Load the 16t values, increment inputPtr later since we're doing it in-place.
+ input = _mm_load_si128((__m128i*)inputPtr);
+ // Do the two shifts
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together
+ output = _mm_or_si128(left, right);
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+
+ // Byteswap any remaining points:
+ number = eighthPoints*8;
+ for(; number < num_points; number++){
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Byteswaps (in-place) an aligned vector of int16_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int point;
+ uint16_t* inputPtr = intsToSwap;
+ for(point = 0; point < num_points; point++){
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+ \brief Byteswaps (in-place) an aligned vector of int16_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param numDataPoints The number of data points
+*/
+extern void volk_16u_byteswap_a16_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
+static inline void volk_16u_byteswap_a16_orc(uint16_t* intsToSwap, unsigned int num_points){
+ volk_16u_byteswap_a16_orc_impl(intsToSwap, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16u_byteswap_a16_H */
diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h
deleted file mode 100644
index 9d19d1a45..000000000
--- a/volk/include/volk/volk_16u_byteswap_aligned16.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H
-#define INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE2
-#include
-
-/*!
- \brief Byteswaps (in-place) an aligned vector of int16_t's.
- \param intsToSwap The vector of data to byte swap
- \param numDataPoints The number of data points
-*/
-static inline void volk_16u_byteswap_aligned16_sse2(uint16_t* intsToSwap, unsigned int num_points){
- unsigned int number = 0;
- uint16_t* inputPtr = intsToSwap;
- __m128i input, left, right, output;
-
- const unsigned int eighthPoints = num_points / 8;
- for(;number < eighthPoints; number++){
- // Load the 16t values, increment inputPtr later since we're doing it in-place.
- input = _mm_load_si128((__m128i*)inputPtr);
- // Do the two shifts
- left = _mm_slli_epi16(input, 8);
- right = _mm_srli_epi16(input, 8);
- // Or the left and right halves together
- output = _mm_or_si128(left, right);
- // Store the results
- _mm_store_si128((__m128i*)inputPtr, output);
- inputPtr += 8;
- }
-
-
- // Byteswap any remaining points:
- number = eighthPoints*8;
- for(; number < num_points; number++){
- uint16_t outputVal = *inputPtr;
- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
- *inputPtr = outputVal;
- inputPtr++;
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
- \brief Byteswaps (in-place) an aligned vector of int16_t's.
- \param intsToSwap The vector of data to byte swap
- \param numDataPoints The number of data points
-*/
-static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, unsigned int num_points){
- unsigned int point;
- uint16_t* inputPtr = intsToSwap;
- for(point = 0; point < num_points; point++){
- uint16_t output = *inputPtr;
- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
- *inputPtr = output;
- inputPtr++;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
- \brief Byteswaps (in-place) an aligned vector of int16_t's.
- \param intsToSwap The vector of data to byte swap
- \param numDataPoints The number of data points
-*/
-extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
-static inline void volk_16u_byteswap_aligned16_orc(uint16_t* intsToSwap, unsigned int num_points){
- volk_16u_byteswap_aligned16_orc_impl(intsToSwap, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h
new file mode 100644
index 000000000..a0f97f94e
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h
@@ -0,0 +1,151 @@
+#ifndef INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H
+#define INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H
+
+#include
+#include
+#include
+
+#ifndef MAX
+#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#endif
+
+#if LV_HAVE_SSE3
+#include
+#include
+
+static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+
+
+ float result = 0.0;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+ //float fith = 0.0;
+
+
+
+ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
+
+ xmm9 = _mm_setzero_ps();
+ xmm1 = _mm_setzero_ps();
+
+ xmm0 = _mm_load1_ps(¢er_point_array[0]);
+ xmm6 = _mm_load1_ps(¢er_point_array[1]);
+ xmm7 = _mm_load1_ps(¢er_point_array[2]);
+ xmm8 = _mm_load1_ps(¢er_point_array[3]);
+ //xmm11 = _mm_load1_ps(¢er_point_array[4]);
+ xmm10 = _mm_load1_ps(cutoff);
+
+ int bound = num_bytes >> 4;
+ int leftovers = (num_bytes >> 2) & 3;
+ int i = 0;
+
+ for(; i < bound; ++i) {
+ xmm2 = _mm_load_ps(src0);
+ xmm2 = _mm_max_ps(xmm10, xmm2);
+ xmm3 = _mm_mul_ps(xmm2, xmm2);
+ xmm4 = _mm_mul_ps(xmm2, xmm3);
+ xmm5 = _mm_mul_ps(xmm3, xmm3);
+ //xmm12 = _mm_mul_ps(xmm3, xmm4);
+
+ xmm2 = _mm_mul_ps(xmm2, xmm0);
+ xmm3 = _mm_mul_ps(xmm3, xmm6);
+ xmm4 = _mm_mul_ps(xmm4, xmm7);
+ xmm5 = _mm_mul_ps(xmm5, xmm8);
+ //xmm12 = _mm_mul_ps(xmm12, xmm11);
+
+ xmm2 = _mm_add_ps(xmm2, xmm3);
+ xmm3 = _mm_add_ps(xmm4, xmm5);
+
+ src0 += 4;
+
+ xmm9 = _mm_add_ps(xmm2, xmm9);
+
+ xmm1 = _mm_add_ps(xmm3, xmm1);
+
+ //xmm9 = _mm_add_ps(xmm12, xmm9);
+ }
+
+ xmm2 = _mm_hadd_ps(xmm9, xmm1);
+ xmm3 = _mm_hadd_ps(xmm2, xmm2);
+ xmm4 = _mm_hadd_ps(xmm3, xmm3);
+
+ _mm_store_ss(&result, xmm4);
+
+
+
+ for(i = 0; i < leftovers; ++i) {
+ fst = src0[i];
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ //fith = sq * thrd;
+
+ result += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth);// +
+ //center_point_array[4] * fith);
+ }
+
+ result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
+
+ target[0] = result;
+}
+
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+
+
+
+ float result = 0.0;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+ //float fith = 0.0;
+
+
+
+ int i = 0;
+
+ for(; i < num_bytes >> 2; ++i) {
+ fst = src0[i];
+ fst = MAX(fst, *cutoff);
+
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ //fith = sq * thrd;
+
+ result += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth); //+
+ //center_point_array[4] * fith);
+ /*printf("%f12...%d\n", (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth) +
+ //center_point_array[4] * fith) +
+ (center_point_array[4]), i);
+ */
+ }
+
+ result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
+
+
+
+ *target = result;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H*/
diff --git a/volk/include/volk/volk_32f_32f_add_32f_a16.h b/volk/include/volk/volk_32f_32f_add_32f_a16.h
new file mode 100644
index 000000000..ba38c310f
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_add_32f_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_32f_add_32f_a16_H
+#define INCLUDED_volk_32f_32f_add_32f_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE
+#include
+/*!
+ \brief Adds the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_32f_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_add_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Adds the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_32f_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+ \brief Adds the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+extern void volk_32f_32f_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_32f_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_32f_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_32f_add_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_divide_32f_a16.h b/volk/include/volk/volk_32f_32f_divide_32f_a16.h
new file mode 100644
index 000000000..a0995e631
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_divide_32f_a16.h
@@ -0,0 +1,82 @@
+#ifndef INCLUDED_volk_32f_32f_divide_32f_a16_H
+#define INCLUDED_volk_32f_32f_divide_32f_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE
+#include
+/*!
+ \brief Divides the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be divideed
+ \param bVector The divisor vector
+ \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+static inline void volk_32f_32f_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_div_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Divides the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be divideed
+ \param bVector The divisor vector
+ \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+static inline void volk_32f_32f_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+ \brief Divides the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be divideed
+ \param bVector The divisor vector
+ \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+extern void volk_32f_32f_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_32f_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_32f_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_32f_32f_divide_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h
new file mode 100644
index 000000000..63f5221d3
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h
@@ -0,0 +1,184 @@
+#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_a16_H
+#define INCLUDED_volk_32f_32f_dot_prod_32f_a16_H
+
+#include
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32f_32f_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE
+
+
+static inline void volk_32f_32f_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal, bVal, cVal;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ dotProdVal = _mm_add_ps(cVal, dotProdVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ float dotProductVector[4] __attribute__((aligned(16)));
+
+ _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#if LV_HAVE_SSE3
+
+#include
+
+static inline void volk_32f_32f_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal, bVal, cVal;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ float dotProductVector[4] __attribute__((aligned(16)));
+ dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+
+ _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include
+
+static inline void volk_32f_32f_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+
+ bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
+
+ float dotProductVector[4] __attribute__((aligned(16)));
+ _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_a16_H*/
diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h
new file mode 100644
index 000000000..b5fa7d7a4
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h
@@ -0,0 +1,184 @@
+#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H
+#define INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H
+
+#include
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32f_32f_dot_prod_32f_ua16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE
+
+
+static inline void volk_32f_32f_dot_prod_32f_ua16_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal, bVal, cVal;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_loadu_ps(aPtr);
+ bVal = _mm_loadu_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ dotProdVal = _mm_add_ps(cVal, dotProdVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ float dotProductVector[4] __attribute__((aligned(16)));
+
+ _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#if LV_HAVE_SSE3
+
+#include
+
+static inline void volk_32f_32f_dot_prod_32f_ua16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal, bVal, cVal;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_loadu_ps(aPtr);
+ bVal = _mm_loadu_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ float dotProductVector[4] __attribute__((aligned(16)));
+ dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+
+ _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include
+
+static inline void volk_32f_32f_dot_prod_32f_ua16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+
+ bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
+
+ float dotProductVector[4] __attribute__((aligned(16)));
+ _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H*/
diff --git a/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h b/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h
new file mode 100644
index 000000000..34ea93349
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32f_32f_interleave_32fc_a16_H
+#define INCLUDED_volk_32f_32f_interleave_32fc_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE
+#include
+/*!
+ \brief Interleaves the I & Q vector data into the complex vector
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_32f_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+ unsigned int number = 0;
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ const uint64_t quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue;
+ for(;number < quarterPoints; number++){
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue = _mm_unpacklo_ps(iValue, qValue);
+ _mm_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 4;
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue = _mm_unpackhi_ps(iValue, qValue);
+ _mm_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 4;
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Interleaves the I & Q vector data into the complex vector.
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_32f_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+ unsigned int number;
+
+ for(number = 0; number < num_points; number++){
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_32f_interleave_32fc_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_max_32f_a16.h b/volk/include/volk/volk_32f_32f_max_32f_a16.h
new file mode 100644
index 000000000..8ca7a5ba8
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_max_32f_a16.h
@@ -0,0 +1,85 @@
+#ifndef INCLUDED_volk_32f_32f_max_32f_a16_H
+#define INCLUDED_volk_32f_32f_max_32f_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE
+#include
+/*!
+ \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_32f_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_max_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_32f_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+ \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+extern void volk_32f_32f_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_32f_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_32f_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_32f_max_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_min_32f_a16.h b/volk/include/volk/volk_32f_32f_min_32f_a16.h
new file mode 100644
index 000000000..dd05988be
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_min_32f_a16.h
@@ -0,0 +1,85 @@
+#ifndef INCLUDED_volk_32f_32f_min_32f_a16_H
+#define INCLUDED_volk_32f_32f_min_32f_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE
+#include
+/*!
+ \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_32f_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_min_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_32f_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+ \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+extern void volk_32f_32f_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_32f_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_32f_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_32f_min_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_multiply_32f_a16.h b/volk/include/volk/volk_32f_32f_multiply_32f_a16.h
new file mode 100644
index 000000000..2d004db10
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_multiply_32f_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_32f_multiply_32f_a16_H
+#define INCLUDED_volk_32f_32f_multiply_32f_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE
+#include
+/*!
+ \brief Multiplys the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_32f_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Multiplys the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_32f_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+ \brief Multiplys the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+extern void volk_32f_32f_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_32f_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_32f_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_32f_multiply_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h b/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h
new file mode 100644
index 000000000..207382a19
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h
@@ -0,0 +1,155 @@
+#ifndef INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H
+#define INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE2
+#include
+ /*!
+ \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param scalar The scaling value being multiplied against each data point
+ \param num_points The number of complex data values to be interleaved
+ */
+static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue1, cplxValue2;
+ __m128i intValue1, intValue2;
+
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ for(;number < quarterPoints; number++){
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+ cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+ cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+
+ intValue1 = _mm_cvtps_epi32(cplxValue1);
+ intValue2 = _mm_cvtps_epi32(cplxValue2);
+
+ intValue1 = _mm_packs_epi32(intValue1, intValue2);
+
+ _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
+ complexVectorPtr += 8;
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for(; number < num_points; number++){
+ *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+ }
+
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include
+ /*!
+ \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param scalar The scaling value being multiplied against each data point
+ \param num_points The number of complex data values to be interleaved
+ */
+static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue;
+
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ float floatBuffer[4] __attribute__((aligned(128)));
+
+ for(;number < quarterPoints; number++){
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue = _mm_unpacklo_ps(iValue, qValue);
+ cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+ _mm_store_ps(floatBuffer, cplxValue);
+
+ *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue = _mm_unpackhi_ps(iValue, qValue);
+ cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+ _mm_store_ps(floatBuffer, cplxValue);
+
+ *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for(; number < num_points; number++){
+ *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+ }
+
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+ /*!
+ \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param scalar The scaling value being multiplied against each data point
+ \param num_points The number of complex data values to be interleaved
+ */
+static inline void volk_32f_32f_s32f_interleave_16sc_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_subtract_32f_a16.h b/volk/include/volk/volk_32f_32f_subtract_32f_a16.h
new file mode 100644
index 000000000..9fea6aa27
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_subtract_32f_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_32f_subtract_32f_a16_H
+#define INCLUDED_volk_32f_32f_subtract_32f_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE
+#include
+/*!
+ \brief Subtracts bVector form aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The initial vector
+ \param bVector The vector to be subtracted
+ \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_32f_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_sub_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Subtracts bVector form aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The initial vector
+ \param bVector The vector to be subtracted
+ \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_32f_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+ \brief Subtracts bVector form aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The initial vector
+ \param bVector The vector to be subtracted
+ \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+extern void volk_32f_32f_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_32f_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_32f_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_32f_subtract_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_accumulator_aligned16.h b/volk/include/volk/volk_32f_accumulator_aligned16.h
deleted file mode 100644
index 7e395cf50..000000000
--- a/volk/include/volk/volk_32f_accumulator_aligned16.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H
-#define INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE
-#include
-/*!
- \brief Accumulates the values in the input buffer
- \param result The accumulated result
- \param inputBuffer The buffer of data to be accumulated
- \param num_points The number of values in inputBuffer to be accumulated
-*/
-static inline void volk_32f_accumulator_aligned16_sse(float* result, const float* inputBuffer, unsigned int num_points){
- float returnValue = 0;
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* aPtr = inputBuffer;
- float tempBuffer[4] __attribute__((aligned(128)));
-
- __m128 accumulator = _mm_setzero_ps();
- __m128 aVal = _mm_setzero_ps();
-
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- accumulator = _mm_add_ps(accumulator, aVal);
- aPtr += 4;
- }
- _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
- returnValue = tempBuffer[0];
- returnValue += tempBuffer[1];
- returnValue += tempBuffer[2];
- returnValue += tempBuffer[3];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- returnValue += (*aPtr++);
- }
- *result = returnValue;
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
- \brief Accumulates the values in the input buffer
- \param result The accumulated result
- \param inputBuffer The buffer of data to be accumulated
- \param num_points The number of values in inputBuffer to be accumulated
-*/
-static inline void volk_32f_accumulator_aligned16_generic(float* result, const float* inputBuffer, unsigned int num_points){
- const float* aPtr = inputBuffer;
- unsigned int number = 0;
- float returnValue = 0;
-
- for(;number < num_points; number++){
- returnValue += (*aPtr++);
- }
- *result = returnValue;
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a16.h b/volk/include/volk/volk_32f_accumulator_s32f_a16.h
new file mode 100644
index 000000000..4a3588e6d
--- /dev/null
+++ b/volk/include/volk/volk_32f_accumulator_s32f_a16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H
+#define INCLUDED_volk_32f_accumulator_s32f_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE
+#include
+/*!
+ \brief Accumulates the values in the input buffer
+ \param result The accumulated result
+ \param inputBuffer The buffer of data to be accumulated
+ \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* inputBuffer, unsigned int num_points){
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+ float tempBuffer[4] __attribute__((aligned(128)));
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ accumulator = _mm_add_ps(accumulator, aVal);
+ aPtr += 4;
+ }
+ _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
+ returnValue = tempBuffer[0];
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Accumulates the values in the input buffer
+ \param result The accumulated result
+ \param inputBuffer The buffer of data to be accumulated
+ \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_32f_accumulator_s32f_a16_generic(float* result, const float* inputBuffer, unsigned int num_points){
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+ float returnValue = 0;
+
+ for(;number < num_points; number++){
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_accumulator_s32f_a16_H */
diff --git a/volk/include/volk/volk_32f_add_aligned16.h b/volk/include/volk/volk_32f_add_aligned16.h
deleted file mode 100644
index e7d8de265..000000000
--- a/volk/include/volk/volk_32f_add_aligned16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_ADD_ALIGNED16_H
-#define INCLUDED_VOLK_32f_ADD_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE
-#include
-/*!
- \brief Adds the two input vectors and store their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be added
- \param bVector One of the vectors to be added
- \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-static inline void volk_32f_add_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
-
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_add_ps(aVal, bVal);
-
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
- \brief Adds the two input vectors and store their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be added
- \param bVector One of the vectors to be added
- \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-static inline void volk_32f_add_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
- \brief Adds the two input vectors and store their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be added
- \param bVector One of the vectors to be added
- \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-extern void volk_32f_add_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_add_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
- volk_32f_add_aligned16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_32f_ADD_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h
new file mode 100644
index 000000000..fce77cd04
--- /dev/null
+++ b/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h
@@ -0,0 +1,167 @@
+#ifndef INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H
+#define INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE
+#include
+/*!
+ \brief Calculates the spectral noise floor of an input power spectrum
+
+ Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor.
+
+ \param realDataPoints The input power spectrum
+ \param num_points The number of data points in the input power spectrum vector
+ \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+ \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+*/
+static inline void volk_32f_calc_spectral_noise_floor_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* dataPointsPtr = realDataPoints;
+ float avgPointsVector[4] __attribute__((aligned(128)));
+
+ __m128 dataPointsVal;
+ __m128 avgPointsVal = _mm_setzero_ps();
+ // Calculate the sum (for mean) for all points
+ for(; number < quarterPoints; number++){
+
+ dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 4;
+
+ avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
+ }
+
+ _mm_store_ps(avgPointsVector, avgPointsVal);
+
+ float sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+ // that are significantly higher (and would, thus, affect the mean more
+ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+ __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
+ __m128 vOnesVector = _mm_set_ps1(1.0);
+ __m128 vValidBinCount = _mm_setzero_ps();
+ avgPointsVal = _mm_setzero_ps();
+ __m128 compareMask;
+ number = 0;
+ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+ for(; number < quarterPoints; number++){
+
+ dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 4;
+
+ // Identify which items do not exceed the mean amplitude
+ compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
+
+ // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
+ avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
+
+ // Count the number of bins which do not exceed the mean amplitude
+ vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
+ }
+
+ // Calculate the mean from the remaining data points
+ _mm_store_ps(avgPointsVector, avgPointsVal);
+
+ sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+
+ // Calculate the number of valid bins from the remaning count
+ float validBinCountVector[4] __attribute__((aligned(128)));
+ _mm_store_ps(validBinCountVector, vValidBinCount);
+
+ float validBinCount = 0;
+ validBinCount += validBinCountVector[0];
+ validBinCount += validBinCountVector[1];
+ validBinCount += validBinCountVector[2];
+ validBinCount += validBinCountVector[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ if(realDataPoints[number] <= meanAmplitude){
+ sumMean += realDataPoints[number];
+ validBinCount += 1.0;
+ }
+ }
+
+ float localNoiseFloorAmplitude = 0;
+ if(validBinCount > 0.0){
+ localNoiseFloorAmplitude = sumMean / validBinCount;
+ }
+ else{
+ localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
+ }
+
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+ \brief Calculates the spectral noise floor of an input power spectrum
+
+ Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor.
+
+ \param realDataPoints The input power spectrum
+ \param num_points The number of data points in the input power spectrum vector
+ \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+ \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+*/
+static inline void volk_32f_calc_spectral_noise_floor_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+ float sumMean = 0.0;
+ unsigned int number;
+ // find the sum (for mean), etc
+ for(number = 0; number < num_points; number++){
+ // sum (for mean)
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+ // that are significantly higher (and would, thus, affect the mean more)
+ const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
+
+ // now throw out any bins higher than the mean
+ sumMean = 0.0;
+ unsigned int newNumDataPoints = num_points;
+ for(number = 0; number < num_points; number++){
+ if (realDataPoints[number] <= meanAmplitude)
+ sumMean += realDataPoints[number];
+ else
+ newNumDataPoints--;
+ }
+
+ float localNoiseFloorAmplitude = 0.0;
+ if (newNumDataPoints == 0) // in the odd case that all
+ localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
+ else
+ localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
+
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H */
diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h
deleted file mode 100644
index ff917525f..000000000
--- a/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h
+++ /dev/null
@@ -1,167 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
-#define INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE
-#include
-/*!
- \brief Calculates the spectral noise floor of an input power spectrum
-
- Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor.
-
- \param realDataPoints The input power spectrum
- \param num_points The number of data points in the input power spectrum vector
- \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
- \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
-*/
-static inline void volk_32f_calc_spectral_noise_floor_aligned16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* dataPointsPtr = realDataPoints;
- float avgPointsVector[4] __attribute__((aligned(128)));
-
- __m128 dataPointsVal;
- __m128 avgPointsVal = _mm_setzero_ps();
- // Calculate the sum (for mean) for all points
- for(; number < quarterPoints; number++){
-
- dataPointsVal = _mm_load_ps(dataPointsPtr);
-
- dataPointsPtr += 4;
-
- avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
- }
-
- _mm_store_ps(avgPointsVector, avgPointsVal);
-
- float sumMean = 0.0;
- sumMean += avgPointsVector[0];
- sumMean += avgPointsVector[1];
- sumMean += avgPointsVector[2];
- sumMean += avgPointsVector[3];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- sumMean += realDataPoints[number];
- }
-
- // calculate the spectral mean
- // +20 because for the comparison below we only want to throw out bins
- // that are significantly higher (and would, thus, affect the mean more
- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
-
- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
- __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
- __m128 vOnesVector = _mm_set_ps1(1.0);
- __m128 vValidBinCount = _mm_setzero_ps();
- avgPointsVal = _mm_setzero_ps();
- __m128 compareMask;
- number = 0;
- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
- for(; number < quarterPoints; number++){
-
- dataPointsVal = _mm_load_ps(dataPointsPtr);
-
- dataPointsPtr += 4;
-
- // Identify which items do not exceed the mean amplitude
- compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
-
- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
- avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
-
- // Count the number of bins which do not exceed the mean amplitude
- vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
- }
-
- // Calculate the mean from the remaining data points
- _mm_store_ps(avgPointsVector, avgPointsVal);
-
- sumMean = 0.0;
- sumMean += avgPointsVector[0];
- sumMean += avgPointsVector[1];
- sumMean += avgPointsVector[2];
- sumMean += avgPointsVector[3];
-
- // Calculate the number of valid bins from the remaning count
- float validBinCountVector[4] __attribute__((aligned(128)));
- _mm_store_ps(validBinCountVector, vValidBinCount);
-
- float validBinCount = 0;
- validBinCount += validBinCountVector[0];
- validBinCount += validBinCountVector[1];
- validBinCount += validBinCountVector[2];
- validBinCount += validBinCountVector[3];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- if(realDataPoints[number] <= meanAmplitude){
- sumMean += realDataPoints[number];
- validBinCount += 1.0;
- }
- }
-
- float localNoiseFloorAmplitude = 0;
- if(validBinCount > 0.0){
- localNoiseFloorAmplitude = sumMean / validBinCount;
- }
- else{
- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
- }
-
- *noiseFloorAmplitude = localNoiseFloorAmplitude;
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
- \brief Calculates the spectral noise floor of an input power spectrum
-
- Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor.
-
- \param realDataPoints The input power spectrum
- \param num_points The number of data points in the input power spectrum vector
- \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
- \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
-*/
-static inline void volk_32f_calc_spectral_noise_floor_aligned16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
- float sumMean = 0.0;
- unsigned int number;
- // find the sum (for mean), etc
- for(number = 0; number < num_points; number++){
- // sum (for mean)
- sumMean += realDataPoints[number];
- }
-
- // calculate the spectral mean
- // +20 because for the comparison below we only want to throw out bins
- // that are significantly higher (and would, thus, affect the mean more)
- const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
-
- // now throw out any bins higher than the mean
- sumMean = 0.0;
- unsigned int newNumDataPoints = num_points;
- for(number = 0; number < num_points; number++){
- if (realDataPoints[number] <= meanAmplitude)
- sumMean += realDataPoints[number];
- else
- newNumDataPoints--;
- }
-
- float localNoiseFloorAmplitude = 0.0;
- if (newNumDataPoints == 0) // in the odd case that all
- localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
- else
- localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
-
- *noiseFloorAmplitude = localNoiseFloorAmplitude;
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_16s_aligned16.h b/volk/include/volk/volk_32f_convert_16s_aligned16.h
deleted file mode 100644
index 7fbabd9c3..000000000
--- a/volk/include/volk/volk_32f_convert_16s_aligned16.h
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE2
-#include
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 16 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_convert_16s_aligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int eighthPoints = num_points / 8;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1, inputVal2;
- __m128i intInputVal1, intInputVal2;
-
- for(;number < eighthPoints; number++){
- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
- intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
- intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 16 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_convert_16s_aligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
-
- float outputFloatBuffer[4] __attribute__((aligned(128)));
-
- for(;number < quarterPoints; number++){
- ret = _mm_load_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- ret = _mm_mul_ps(ret, vScalar);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 16 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_convert_16s_aligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- int16_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_16s_unaligned16.h b/volk/include/volk/volk_32f_convert_16s_unaligned16.h
deleted file mode 100644
index d2bbdf13a..000000000
--- a/volk/include/volk/volk_32f_convert_16s_unaligned16.h
+++ /dev/null
@@ -1,113 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE2
-#include
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 16 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- \note Input buffer does NOT need to be properly aligned
- */
-static inline void volk_32f_convert_16s_unaligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int eighthPoints = num_points / 8;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1, inputVal2;
- __m128i intInputVal1, intInputVal2;
-
- for(;number < eighthPoints; number++){
- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
- intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
- intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] = (int16_t)(inputVector[number] * scalar);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 16 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- \note Input buffer does NOT need to be properly aligned
- */
-static inline void volk_32f_convert_16s_unaligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
-
- float outputFloatBuffer[4] __attribute__((aligned(128)));
-
- for(;number < quarterPoints; number++){
- ret = _mm_loadu_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- ret = _mm_mul_ps(ret, vScalar);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (int16_t)(inputVector[number] * scalar);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 16 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- \note Input buffer does NOT need to be properly aligned
- */
-static inline void volk_32f_convert_16s_unaligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- int16_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_32s_aligned16.h b/volk/include/volk/volk_32f_convert_32s_aligned16.h
deleted file mode 100644
index 011ef5d0e..000000000
--- a/volk/include/volk/volk_32f_convert_32s_aligned16.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE2
-#include
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 32 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_convert_32s_aligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int32_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1;
- __m128i intInputVal1;
-
- for(;number < quarterPoints; number++){
- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
- intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-
- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (int32_t)(inputVector[number] * scalar);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 32 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_convert_32s_aligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int32_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
-
- float outputFloatBuffer[4] __attribute__((aligned(128)));
-
- for(;number < quarterPoints; number++){
- ret = _mm_load_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- ret = _mm_mul_ps(ret, vScalar);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (int32_t)(inputVector[number] * scalar);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 32 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_convert_32s_aligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- int32_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_32s_unaligned16.h b/volk/include/volk/volk_32f_convert_32s_unaligned16.h
deleted file mode 100644
index a6df826c7..000000000
--- a/volk/include/volk/volk_32f_convert_32s_unaligned16.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE2
-#include
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 32 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- \note Input buffer does NOT need to be properly aligned
- */
-static inline void volk_32f_convert_32s_unaligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int32_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1;
- __m128i intInputVal1;
-
- for(;number < quarterPoints; number++){
- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
- intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-
- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (int32_t)(inputVector[number] * scalar);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 32 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- \note Input buffer does NOT need to be properly aligned
- */
-static inline void volk_32f_convert_32s_unaligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int32_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
-
- float outputFloatBuffer[4] __attribute__((aligned(128)));
-
- for(;number < quarterPoints; number++){
- ret = _mm_loadu_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- ret = _mm_mul_ps(ret, vScalar);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (int32_t)(inputVector[number] * scalar);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 32 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- \note Input buffer does NOT need to be properly aligned
- */
-static inline void volk_32f_convert_32s_unaligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- int32_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_a16.h b/volk/include/volk/volk_32f_convert_64f_a16.h
new file mode 100644
index 000000000..c303dc118
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_64f_a16.h
@@ -0,0 +1,70 @@
+#ifndef INCLUDED_volk_32f_convert_64f_a16_H
+#define INCLUDED_volk_32f_convert_64f_a16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE2
+#include
+ /*!
+ \brief Converts the float values into double values
+ \param dVector The converted double vector values
+ \param fVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_32f_convert_64f_a16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+
+ inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the float values into double values
+ \param dVector The converted double vector values
+ \param fVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_a16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+ double* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_a16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_aligned16.h b/volk/include/volk/volk_32f_convert_64f_aligned16.h
deleted file mode 100644
index 91a855813..000000000
--- a/volk/include/volk/volk_32f_convert_64f_aligned16.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE2
-#include
- /*!
- \brief Converts the float values into double values
- \param dVector The converted double vector values
- \param fVector The float vector values to be converted
- \param num_points The number of points in the two vectors to be converted
- */
-static inline void volk_32f_convert_64f_aligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- double* outputVectorPtr = outputVector;
- __m128d ret;
- __m128 inputVal;
-
- for(;number < quarterPoints; number++){
- inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
- ret = _mm_cvtps_pd(inputVal);
-
- _mm_store_pd(outputVectorPtr, ret);
- outputVectorPtr += 2;
-
- inputVal = _mm_movehl_ps(inputVal, inputVal);
-
- ret = _mm_cvtps_pd(inputVal);
-
- _mm_store_pd(outputVectorPtr, ret);
- outputVectorPtr += 2;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (double)(inputVector[number]);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Converts the float values into double values
- \param dVector The converted double vector values
- \param fVector The float vector values to be converted
- \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_32f_convert_64f_aligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
- double* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((double)(*inputVectorPtr++));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_ua16.h b/volk/include/volk/volk_32f_convert_64f_ua16.h
new file mode 100644
index 000000000..c8de768dc
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_64f_ua16.h
@@ -0,0 +1,70 @@
+#ifndef INCLUDED_volk_32f_convert_64f_ua16_H
+#define INCLUDED_volk_32f_convert_64f_ua16_H
+
+#include
+#include
+
+#if LV_HAVE_SSE2
+#include
+ /*!
+ \brief Converts the float values into double values
+ \param dVector The converted double vector values
+ \param fVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_32f_convert_64f_ua16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+
+ inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the float values into double values
+ \param dVector The converted double vector values
+ \param fVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_ua16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+ double* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_ua16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_unaligned16.h b/volk/include/volk/volk_32f_convert_64f_unaligned16.h
deleted file mode 100644
index 698e0d446..000000000
--- a/volk/include/volk/volk_32f_convert_64f_unaligned16.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE2
-#include
- /*!
- \brief Converts the float values into double values
- \param dVector The converted double vector values
- \param fVector The float vector values to be converted
- \param num_points The number of points in the two vectors to be converted
- */
-static inline void volk_32f_convert_64f_unaligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- double* outputVectorPtr = outputVector;
- __m128d ret;
- __m128 inputVal;
-
- for(;number < quarterPoints; number++){
- inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
- ret = _mm_cvtps_pd(inputVal);
-
- _mm_storeu_pd(outputVectorPtr, ret);
- outputVectorPtr += 2;
-
- inputVal = _mm_movehl_ps(inputVal, inputVal);
-
- ret = _mm_cvtps_pd(inputVal);
-
- _mm_storeu_pd(outputVectorPtr, ret);
- outputVectorPtr += 2;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (double)(inputVector[number]);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Converts the float values into double values
- \param dVector The converted double vector values
- \param fVector The float vector values to be converted
- \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_32f_convert_64f_unaligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
- double* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((double)(*inputVectorPtr++));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_8s_aligned16.h b/volk/include/volk/volk_32f_convert_8s_aligned16.h
deleted file mode 100644
index b9487b622..000000000
--- a/volk/include/volk/volk_32f_convert_8s_aligned16.h
+++ /dev/null
@@ -1,117 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE2
-#include
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 8 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_convert_8s_aligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int sixteenthPoints = num_points / 16;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int8_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1, inputVal2, inputVal3, inputVal4;
- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-
- for(;number < sixteenthPoints; number++){
- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
- intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
- intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
- intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
- intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
-
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
- intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
- intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (int8_t)(inputVector[number] * scalar);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 8 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_convert_8s_aligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int8_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
-
- float outputFloatBuffer[4] __attribute__((aligned(128)));
-
- for(;number < quarterPoints; number++){
- ret = _mm_load_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- ret = _mm_mul_ps(ret, vScalar);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (int8_t)(inputVector[number] * scalar);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 8 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_convert_8s_aligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- int8_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_8s_unaligned16.h b/volk/include/volk/volk_32f_convert_8s_unaligned16.h
deleted file mode 100644
index e986dbc87..000000000
--- a/volk/include/volk/volk_32f_convert_8s_unaligned16.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE2
-#include
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 8 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- \note Input buffer does NOT need to be properly aligned
- */
-static inline void volk_32f_convert_8s_unaligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int sixteenthPoints = num_points / 16;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int8_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1, inputVal2, inputVal3, inputVal4;
- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-
- for(;number < sixteenthPoints; number++){
- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
- intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
- intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
- intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
- intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
-
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
- intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
- intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (int8_t)(inputVector[number] * scalar);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 8 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- \note Input buffer does NOT need to be properly aligned
- */
-static inline void volk_32f_convert_8s_unaligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int8_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
-
- float outputFloatBuffer[4] __attribute__((aligned(128)));
-
- for(;number < quarterPoints; number++){
- ret = _mm_loadu_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- ret = _mm_mul_ps(ret, vScalar);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (int8_t)(inputVector[number] * scalar);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 8 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- \note Input buffer does NOT need to be properly aligned
- */
-static inline void volk_32f_convert_8s_unaligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- int8_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_divide_aligned16.h b/volk/include/volk/volk_32f_divide_aligned16.h
deleted file mode 100644
index c595b5e92..000000000
--- a/volk/include/volk/volk_32f_divide_aligned16.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H
-#define INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE
-#include
-/*!
- \brief Divides the two input vectors and store their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector The vector to be divideed
- \param bVector The divisor vector
- \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
-*/
-static inline void volk_32f_divide_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
-
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_div_ps(aVal, bVal);
-
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) / (*bPtr++);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
- \brief Divides the two input vectors and store their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector The vector to be divideed
- \param bVector The divisor vector
- \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
-*/
-static inline void volk_32f_divide_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) / (*bPtr++);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
- \brief Divides the two input vectors and store their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector The vector to be divideed
- \param bVector The divisor vector
- \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
-*/
-extern void volk_32f_divide_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_divide_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
- volk_32f_divide_aligned16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-#endif /* INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_dot_prod_aligned16.h b/volk/include/volk/volk_32f_dot_prod_aligned16.h
deleted file mode 100644
index 3aee1136a..000000000
--- a/volk/include/volk/volk_32f_dot_prod_aligned16.h
+++ /dev/null
@@ -1,184 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H
-#define INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H
-
-#include
-
-
-#if LV_HAVE_GENERIC
-
-
-static inline void volk_32f_dot_prod_aligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr= taps;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE
-
-
-static inline void volk_32f_dot_prod_aligned16_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 aVal, bVal, cVal;
-
- __m128 dotProdVal = _mm_setzero_ps();
-
- for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
- dotProdVal = _mm_add_ps(cVal, dotProdVal);
-
- aPtr += 4;
- bPtr += 4;
- }
-
- float dotProductVector[4] __attribute__((aligned(16)));
-
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-
-}
-
-#endif /*LV_HAVE_SSE*/
-
-#if LV_HAVE_SSE3
-
-#include
-
-static inline void volk_32f_dot_prod_aligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 aVal, bVal, cVal;
-
- __m128 dotProdVal = _mm_setzero_ps();
-
- for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
- dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
-
- aPtr += 4;
- bPtr += 4;
- }
-
- float dotProductVector[4] __attribute__((aligned(16)));
- dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
-
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_SSE4_1
-
-#include
-
-static inline void volk_32f_dot_prod_aligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 aVal1, bVal1, cVal1;
- __m128 aVal2, bVal2, cVal2;
- __m128 aVal3, bVal3, cVal3;
- __m128 aVal4, bVal4, cVal4;
-
- __m128 dotProdVal = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
- aVal2 = _mm_load_ps(aPtr); aPtr += 4;
- aVal3 = _mm_load_ps(aPtr); aPtr += 4;
- aVal4 = _mm_load_ps(aPtr); aPtr += 4;
-
- bVal1 = _mm_load_ps(bPtr); bPtr += 4;
- bVal2 = _mm_load_ps(bPtr); bPtr += 4;
- bVal3 = _mm_load_ps(bPtr); bPtr += 4;
- bVal4 = _mm_load_ps(bPtr); bPtr += 4;
-
- cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
- cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
- cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
- cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
-
- cVal1 = _mm_or_ps(cVal1, cVal2);
- cVal3 = _mm_or_ps(cVal3, cVal4);
- cVal1 = _mm_or_ps(cVal1, cVal3);
-
- dotProdVal = _mm_add_ps(dotProdVal, cVal1);
- }
-
- float dotProductVector[4] __attribute__((aligned(16)));
- _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#endif /*INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32f_dot_prod_unaligned16.h b/volk/include/volk/volk_32f_dot_prod_unaligned16.h
deleted file mode 100644
index bce6aa15f..000000000
--- a/volk/include/volk/volk_32f_dot_prod_unaligned16.h
+++ /dev/null
@@ -1,184 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H
-#define INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H
-
-#include
-
-
-#if LV_HAVE_GENERIC
-
-
-static inline void volk_32f_dot_prod_unaligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr= taps;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE
-
-
-static inline void volk_32f_dot_prod_unaligned16_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 aVal, bVal, cVal;
-
- __m128 dotProdVal = _mm_setzero_ps();
-
- for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
- bVal = _mm_loadu_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
- dotProdVal = _mm_add_ps(cVal, dotProdVal);
-
- aPtr += 4;
- bPtr += 4;
- }
-
- float dotProductVector[4] __attribute__((aligned(16)));
-
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-
-}
-
-#endif /*LV_HAVE_SSE*/
-
-#if LV_HAVE_SSE3
-
-#include
-
-static inline void volk_32f_dot_prod_unaligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 aVal, bVal, cVal;
-
- __m128 dotProdVal = _mm_setzero_ps();
-
- for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
- bVal = _mm_loadu_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
- dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
-
- aPtr += 4;
- bPtr += 4;
- }
-
- float dotProductVector[4] __attribute__((aligned(16)));
- dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
-
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_SSE4_1
-
-#include
-
-static inline void volk_32f_dot_prod_unaligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 aVal1, bVal1, cVal1;
- __m128 aVal2, bVal2, cVal2;
- __m128 aVal3, bVal3, cVal3;
- __m128 aVal4, bVal4, cVal4;
-
- __m128 dotProdVal = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
-
- bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
-
- cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
- cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
- cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
- cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
-
- cVal1 = _mm_or_ps(cVal1, cVal2);
- cVal3 = _mm_or_ps(cVal3, cVal4);
- cVal1 = _mm_or_ps(cVal1, cVal3);
-
- dotProdVal = _mm_add_ps(dotProdVal, cVal1);
- }
-
- float dotProductVector[4] __attribute__((aligned(16)));
- _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#endif /*INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H*/
diff --git a/volk/include/volk/volk_32f_fm_detect_aligned16.h b/volk/include/volk/volk_32f_fm_detect_aligned16.h
deleted file mode 100644
index c82239d74..000000000
--- a/volk/include/volk/volk_32f_fm_detect_aligned16.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H
-#define INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H
-
-#include
-#include
-
-#if LV_HAVE_SSE
-#include
-/*!
- \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector.
- \param outputVector The byte-aligned vector where the results will be stored.
- \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] )
- \param bound The interval that the input phase data is in, which is used to modulo the differentiation
- \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample.
- \param num_noints The number of real values in the input vector.
-*/
-static inline void volk_32f_fm_detect_aligned16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
- if (num_points < 1) {
- return;
- }
- unsigned int number = 1;
- unsigned int j = 0;
- // num_points-1 keeps Fedora 7's gcc from crashing...
- // num_points won't work. :(
- const unsigned int quarterPoints = (num_points-1) / 4;
-
- float* outPtr = outputVector;
- const float* inPtr = inputVector;
- __m128 upperBound = _mm_set_ps1(bound);
- __m128 lowerBound = _mm_set_ps1(-bound);
- __m128 next3old1;
- __m128 next4;
- __m128 boundAdjust;
- __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above.
- __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below.
- // Do the first 4 by hand since we're going in from the saveValue:
- *outPtr = *inPtr - *saveValue;
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
- inPtr++;
- outPtr++;
- for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) {
- *outPtr = *(inPtr) - *(inPtr-1);
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
- inPtr++;
- outPtr++;
- }
-
- for (; number < quarterPoints; number++) {
- // Load data
- next3old1 = _mm_loadu_ps((float*) (inPtr-1));
- next4 = _mm_load_ps(inPtr);
- inPtr += 4;
- // Subtract and store:
- next3old1 = _mm_sub_ps(next4, next3old1);
- // Bound:
- boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
- boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
- next4 = _mm_cmplt_ps(next3old1, lowerBound);
- next4 = _mm_and_ps(next4, negBoundAdjust);
- boundAdjust = _mm_or_ps(next4, boundAdjust);
- // Make sure we're in the bounding interval:
- next3old1 = _mm_add_ps(next3old1, boundAdjust);
- _mm_store_ps(outPtr,next3old1); // Store the results back into the output
- outPtr += 4;
- }
-
- for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) {
- *outPtr = *(inPtr) - *(inPtr-1);
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
- inPtr++;
- outPtr++;
- }
-
- *saveValue = inputVector[num_points-1];
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
- \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector.
- \param outputVector The byte-aligned vector where the results will be stored.
- \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] )
- \param bound The interval that the input phase data is in, which is used to modulo the differentiation
- \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample.
- \param num_points The number of real values in the input vector.
-*/
-static inline void volk_32f_fm_detect_aligned16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
- if (num_points < 1) {
- return;
- }
- unsigned int number = 0;
- float* outPtr = outputVector;
- const float* inPtr = inputVector;
-
- // Do the first 1 by hand since we're going in from the saveValue:
- *outPtr = *inPtr - *saveValue;
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
- inPtr++;
- outPtr++;
-
- for (number = 1; number < num_points; number++) {
- *outPtr = *(inPtr) - *(inPtr-1);
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
- inPtr++;
- outPtr++;
- }
-
- *saveValue = inputVector[num_points-1];
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_index_max_16u_a16.h b/volk/include/volk/volk_32f_index_max_16u_a16.h
new file mode 100644
index 000000000..d070e17d5
--- /dev/null
+++ b/volk/include/volk/volk_32f_index_max_16u_a16.h
@@ -0,0 +1,148 @@
+#ifndef INCLUDED_volk_32f_index_max_16u_a16_H
+#define INCLUDED_volk_32f_index_max_16u_a16_H
+
+#include
+#include
+#include
+
+#if LV_HAVE_SSE4_1
+#include
+
+static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) {
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ float maxValuesBuffer[4] __attribute__((aligned(16)));
+ float maxIndexesBuffer[4] __attribute__((aligned(16)));
+
+ for(;number < quarterPoints; number++){
+
+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
+ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
+ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++){
+ if(maxValuesBuffer[number] > max){
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ if(src0[number] > max){
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (unsigned int)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#if LV_HAVE_SSE
+#include
+
+static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const float* src0, unsigned int num_points) {
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ float maxValuesBuffer[4] __attribute__((aligned(16)));
+ float maxIndexesBuffer[4] __attribute__((aligned(16)));
+
+ for(;number < quarterPoints; number++){
+
+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
+
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++){
+ if(maxValuesBuffer[number] > max){
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ if(src0[number] > max){
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (unsigned int)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, const float* src0, unsigned int num_points) {
+ if(num_points > 0){
+ float max = src0[0];
+ unsigned int index = 0;
+
+ int i = 1;
+
+ for(; i < num_points; ++i) {
+
+ if(src0[i] > max){
+ index = i;
+ max = src0[i];
+ }
+
+ }
+ target[0] = index;
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_max_16u_a16_H*/
diff --git a/volk/include/volk/volk_32f_index_max_aligned16.h b/volk/include/volk/volk_32f_index_max_aligned16.h
deleted file mode 100644
index 26322bfa2..000000000
--- a/volk/include/volk/volk_32f_index_max_aligned16.h
+++ /dev/null
@@ -1,148 +0,0 @@
-#ifndef INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H
-#define INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H
-
-#include
-#include
-#include