summaryrefslogtreecommitdiff
path: root/volk
diff options
context:
space:
mode:
Diffstat (limited to 'volk')
-rw-r--r--volk/apps/volk_profile.cc10
-rw-r--r--volk/gen/compilers.xml14
-rw-r--r--volk/gen/machines.xml8
-rw-r--r--volk/gen/make_cpuid_c.py2
-rw-r--r--volk/gen/volk_regexp.py3
-rw-r--r--volk/gen/volk_register.py3
-rw-r--r--volk/include/volk/volk_16i_max_star_horizontal_16i_a.h6
-rw-r--r--volk/include/volk/volk_32f_s32f_multiply_32f_a.h44
-rw-r--r--volk/include/volk/volk_32fc_index_max_16u_a.h4
-rw-r--r--volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h46
-rw-r--r--volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h8
-rw-r--r--volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h116
-rw-r--r--volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h4
-rw-r--r--volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h4
-rw-r--r--volk/include/volk/volk_common.h2
-rw-r--r--volk/lib/testqa.cc159
-rw-r--r--volk/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc5
-rw-r--r--volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc10
-rw-r--r--volk/orc/volk_32fc_s32fc_multiply_32fc_a_orc_impl.orc18
19 files changed, 356 insertions, 110 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index b9ac9ecc2..10a699872 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -27,8 +27,8 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 204600, 10000, &results);
VOLK_PROFILE(volk_16i_convert_8i_a, 0, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_16i_convert_8i_u, 0, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_16i_max_star_16i_a, 0, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_16i_max_star_horizontal_16i_a, 0, 0, 204600, 10000, &results);
+ //VOLK_PROFILE(volk_16i_max_star_16i_a, 0, 0, 204600, 10000, &results);
+ //VOLK_PROFILE(volk_16i_max_star_horizontal_16i_a, 0, 0, 204600, 10000, &results);
//VOLK_PROFILE(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000, &results);
//VOLK_PROFILE(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000, &results);
VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results);
@@ -46,7 +46,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 204600, 5000, &results);
VOLK_PROFILE(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 204600, 1000, &results);
VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32fc_index_max_16u_a, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_index_max_16u_a, 3, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 204600, 100, &results);
VOLK_PROFILE(volk_32fc_magnitude_32f_a, 1e-4, 0, 204600, 1000, &results);
VOLK_PROFILE(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results);
@@ -66,7 +66,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 5000, &results);
VOLK_PROFILE(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 5000, &results);
//VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000, &results);
- VOLK_PROFILE(volk_32f_index_max_16u_a, 0, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32f_index_max_16u_a, 3, 0, 204600, 5000, &results);
VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 204600, 3000, &results);
VOLK_PROFILE(volk_32f_x2_interleave_32fc_a, 0, 0, 204600, 5000, &results);
VOLK_PROFILE(volk_32f_x2_max_32f_a, 1e-4, 0, 204600, 2000, &results);
@@ -102,6 +102,8 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_8i_convert_16i_u, 0, 0, 204600, 2000, &results);
VOLK_PROFILE(volk_8i_s32f_convert_32f_a, 1e-4, 100, 204600, 2000, &results);
VOLK_PROFILE(volk_8i_s32f_convert_32f_u, 1e-4, 100, 204600, 2000, &results);
+ VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 204600, 1000, &results);
char path[256];
get_config_path(path);
diff --git a/volk/gen/compilers.xml b/volk/gen/compilers.xml
index 70c82e555..005eda2aa 100644
--- a/volk/gen/compilers.xml
+++ b/volk/gen/compilers.xml
@@ -2,9 +2,21 @@
<grammar>
<compiler name="MSVC">
+ <!-- remap the following flags to SSE -->
<remap name="mmmx">arch:SSE</remap>
<remap name="msse">arch:SSE</remap>
+
+ <!-- remap the following flags to SSE2 -->
<remap name="msse2">arch:SSE2</remap>
+
+ <!-- remap the following flags to AVX -->
+ <remap name="msse3">arch:AVX</remap>
+ <remap name="mssse3">arch:AVX</remap>
+ <remap name="msse4.1">arch:AVX</remap>
+ <remap name="msse4.2">arch:AVX</remap>
+ <remap name="mpopcnt">arch:AVX</remap>
+ <remap name="mavx">arch:AVX</remap>
+
<prefix>/</prefix>
</compiler>
@@ -15,4 +27,4 @@
-</grammar> \ No newline at end of file
+</grammar>
diff --git a/volk/gen/machines.xml b/volk/gen/machines.xml
index b872b9fb1..9c19c91c6 100644
--- a/volk/gen/machines.xml
+++ b/volk/gen/machines.xml
@@ -15,9 +15,9 @@
-->
<!--
-Create an SSE2 only machine (without 64/32 inline assembly support).
+Create an SSE2 and AVX only machine (without 64/32 inline assembly support).
This machine is intended to support the MSVC compiler on x86/amd64.
-The MSVC compiler has intrinsic support for SSE and SSE2,
+The MSVC compiler has intrinsic support for SSE, SSE2, AVX
however it does not support the gcc style inline assembly.
-->
@@ -57,6 +57,10 @@ however it does not support the gcc style inline assembly.
<archs>generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx</archs>
</machine>
+<machine name="avx_only">
+<archs>generic mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx</archs>
+</machine>
+
<machine name="altivec">
<archs>generic altivec</archs>
</machine>
diff --git a/volk/gen/make_cpuid_c.py b/volk/gen/make_cpuid_c.py
index 7281f45a3..2be1123a8 100644
--- a/volk/gen/make_cpuid_c.py
+++ b/volk/gen/make_cpuid_c.py
@@ -39,7 +39,7 @@ struct VOLK_CPU volk_cpu;
//implement get cpuid for gcc compilers using a copy of cpuid.h
#if defined(__GNUC__)
#include <gcc_x86_cpuid.h>
-#define cpuid_x86(op, r) __get_cpuid(op, r+0, r+1, r+2, r+3)
+#define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r+0, (unsigned int *)r+1, (unsigned int *)r+2, (unsigned int *)r+3)
//implement get cpuid for MSVC compilers using __cpuid intrinsic
#elif defined(_MSC_VER)
diff --git a/volk/gen/volk_regexp.py b/volk/gen/volk_regexp.py
index b83ce5206..eb4ceb54b 100644
--- a/volk/gen/volk_regexp.py
+++ b/volk/gen/volk_regexp.py
@@ -1,5 +1,4 @@
import re
-import string
remove_after_underscore = re.compile("_.*");
space_remove = re.compile(" ");
@@ -10,5 +9,5 @@ replace_volk = re.compile("volk");
def strip_trailing(tostrip, stripstr):
lindex = tostrip.rfind(stripstr)
- tostrip = tostrip[0:lindex] + string.replace(tostrip[lindex:len(tostrip)], stripstr, "");
+ tostrip = tostrip[0:lindex] + tostrip[lindex:len(tostrip)].replace(stripstr, "");
return tostrip
diff --git a/volk/gen/volk_register.py b/volk/gen/volk_register.py
index cd874e470..0774ece29 100644
--- a/volk/gen/volk_register.py
+++ b/volk/gen/volk_register.py
@@ -4,7 +4,6 @@ import sys
import os
import re
import glob
-import string
from xml.dom import minidom
from volk_regexp import *
from make_cpuid_c import make_cpuid_c
@@ -101,7 +100,7 @@ for filearch in filearchs:
archs_or = "("
for arch in archs:
- archs_or = archs_or + string.upper(arch) + "|";
+ archs_or = archs_or + arch.upper() + "|";
archs_or = archs_or[0:len(archs_or)-1];
archs_or = archs_or + ")";
diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h
index f60b33a41..a10a62350 100644
--- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h
+++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h
@@ -1,6 +1,7 @@
#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
+#include <volk/volk_common.h>
#include<inttypes.h>
#include<stdio.h>
@@ -21,7 +22,7 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in
- volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
__m128i xmm5, xmm6, xmm7, xmm8;
xmm4 = _mm_load_si128((__m128i*)shufmask0);
@@ -92,8 +93,7 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in
xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
-
- _mm_storel_pd((double*)p_target, (__m128d)xmm0);
+ _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
p_target = (__m128i*)((int8_t*)p_target + 8);
diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h
new file mode 100644
index 000000000..37223dc81
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h
@@ -0,0 +1,44 @@
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Scalar float multiply
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param scalar the scalar value
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ for(number = 0; number < num_points; number++){
+ *outputPtr = (*inputPtr) * scalar;
+ inputPtr++;
+ outputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Scalar float multiply
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param scalar the scalar value
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, const float scalar, unsigned int num_points);
+static inline void volk_32f_s32f_multiply_32f_a_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+ volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/include/volk/volk_32fc_index_max_16u_a.h
index 9566aa32e..125a34582 100644
--- a/volk/include/volk/volk_32fc_index_max_16u_a.h
+++ b/volk/include/volk/volk_32fc_index_max_16u_a.h
@@ -87,8 +87,8 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_
xmm2 = _mm_load_ps((float*)src0);
- xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8);
- xmm8 = (__m128i)xmm1;
+ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+ xmm8 = bit128_p(&xmm1)->int_vec;
xmm2 = _mm_mul_ps(xmm2, xmm2);
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
new file mode 100644
index 000000000..b27a7259f
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
@@ -0,0 +1,46 @@
+#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+extern void volk_32fc_s32fc_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points);
+static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h
index f11c93682..02faf86c2 100644
--- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h
+++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h
@@ -96,9 +96,9 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result
in1 = _mm_loadu_ps( (float*) (input+offset) );
in2 = _mm_loadu_ps( (float*) (taps+offset) );
- Rv = in1*in2;
+ Rv = _mm_mul_ps(in1, in2);
fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
- Iv = in1*fehg;
+ Iv = _mm_mul_ps(in1, fehg);
Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
Ivm = _mm_xor_ps( negMask.vec, Iv );
Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
@@ -119,9 +119,9 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result
in1 = _mm_loadu_ps( (float*) (input+offset) );
in2 = _mm_loadu_ps( (float*) (taps+offset) );
- Rv = _mm_and_ps(in1*in2, halfMask.vec);
+ Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec);
fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
- Iv = _mm_and_ps(in1*fehg, halfMask.vec);
+ Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec);
Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
Ivm = _mm_xor_ps( negMask.vec, Iv );
Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h
new file mode 100644
index 000000000..7c0dba7fd
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h
@@ -0,0 +1,116 @@
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ float * res = (float*) result;
+ float * in = (float*) input;
+ float * tp = (float*) taps;
+ unsigned int n_2_ccomplex_blocks = num_points/2;
+ unsigned int isodd = num_points &1;
+
+
+
+ float sum0[2] = {0,0};
+ float sum1[2] = {0,0};
+ unsigned int i = 0;
+
+
+ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+
+
+ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+
+ in += 4;
+ tp += 4;
+
+ }
+
+
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+
+
+ for(i = 0; i < isodd; ++i) {
+
+
+ *result += input[num_points - 1] * taps[num_points - 1];
+
+ }
+
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2*sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points/2;
+
+ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm_setzero_ps();
+
+ for(;number < halfPoints; number++){
+
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+ a += 2;
+ b += 2;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+
+ _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+
+ if(num_points % 1 != 0) {
+ dotProduct += (*a) * (*b);
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h
index 0bb76f1d1..0c280eb6e 100644
--- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h
+++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h
@@ -26,8 +26,8 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect
for(;number < quarterPoints; number++){
// Convert into 8 bit values into 16 bit values
- x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
- y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
+ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
// Calculate the ar*cr - ai*(-ci) portions
realz = _mm_madd_epi16(x,y);
diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h
index 3e05608a4..a2c2b04f6 100644
--- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h
+++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h
@@ -29,8 +29,8 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t*
for(;number < quarterPoints; number++){
// Convert into 8 bit values into 16 bit values
- x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
- y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
+ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
// Calculate the ar*cr - ai*(-ci) portions
realz = _mm_madd_epi16(x,y);
diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h
index 2c935d1fb..38263d5f7 100644
--- a/volk/include/volk/volk_common.h
+++ b/volk/include/volk/volk_common.h
@@ -91,4 +91,6 @@ union bit128{
#endif
};
+#define bit128_p(x) ((union bit128 *)(x))
+
#endif /*INCLUDED_LIBVOLK_COMMON_H*/
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 62e62c2f4..fbd4bdea5 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -4,90 +4,89 @@
//VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000);
//VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000);
-VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 204600, 10000);
-VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a, 0, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a, 0, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 20460, 1000);
-VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a, 0, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a, 1, 0, 20460, 100);
-VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 20460, 1000);
-VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 20460, 10000);
-VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 20460, 10000);
-VOLK_RUN_TESTS(volk_16i_convert_8i_a, 0, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_16i_max_star_16i_a, 0, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a, 0, 0, 20460, 10000);
+VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a, 1, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16i_convert_8i_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_max_star_16i_a, 0, 0, 20460, 10000);
+//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a, 0, 0, 20460, 10000);
//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000);
//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000);
-VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 20460, 1000);
-VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 20460, 50);
-VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 20460, 1000);
-VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 20460, 100);
+VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 20460, 1);
//VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a, 1e-4, 0, 2046, 10000);
-VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 20460, 1000);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1000);
-VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 10000);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 5000);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1000);
-VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 0, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 100);
-VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1000);
-VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1000);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a, 1, 32768, 20460, 10000);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 20460, 10000);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a, 1, 2<<31, 20460, 10000);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 20460, 10000);
-VOLK_RUN_TESTS(volk_32f_convert_64f_a, 1e-4, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a, 1, 128, 20460, 10000);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 20460, 10000);
+VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a, 1, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a, 1, 2<<31, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_convert_64f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a, 1, 128, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 20460, 1);
//VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a, 1e-4, 2046, 10000);
-VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 2046, 100);
-VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 10000);
-VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 2000);
-VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 20460, 5000);
-VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 20460, 5000);
+VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 2046, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000);
-VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 0, 0, 20460, 5000);
-VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 20460, 3000);
-VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 5000);
-VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 2000);
-VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 2000);
-VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_32f_s32f_normalize_a, 1e-4, 100, 20460, 10000);
-VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a, 1e-4, 4, 20460, 100);
-VOLK_RUN_TESTS(volk_32f_sqrt_32f_a, 1e-4, 0, 20460, 100);
-VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 20460, 3000);
-VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 20460, 3000);
-VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a, 1e-4, 0, 20460, 5000);
-VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 20460, 5000);
-VOLK_RUN_TESTS(volk_32i_x2_and_32i_a, 0, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a, 1e-4, 100, 20460, 10000);
-VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 20460, 10000);
-VOLK_RUN_TESTS(volk_32i_x2_or_32i_a, 0, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_32u_byteswap_a, 0, 0, 20460, 2000);
+VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_normalize_a, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a, 1e-4, 4, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_sqrt_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_x2_and_32i_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_x2_or_32i_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32u_byteswap_a, 0, 0, 20460, 1);
//VOLK_RUN_TESTS(volk_32u_popcnt_a, 0, 0, 2046, 10000);
-VOLK_RUN_TESTS(volk_64f_convert_32f_a, 1e-4, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_64f_x2_max_64f_a, 1e-4, 0, 20460, 1000);
-VOLK_RUN_TESTS(volk_64f_x2_min_64f_a, 1e-4, 0, 20460, 1000);
-VOLK_RUN_TESTS(volk_64u_byteswap_a, 0, 0, 20460, 1000);
+VOLK_RUN_TESTS(volk_64f_convert_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64f_x2_max_64f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64f_x2_min_64f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64u_byteswap_a, 0, 0, 20460, 1);
//VOLK_RUN_TESTS(volk_64u_popcnt_a, 0, 0, 2046, 10000);
-VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a, 0, 0, 20460, 3000);
-VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 20460, 3000);
-VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a, 0, 256, 20460, 3000);
-VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 20460, 3000);
-VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a, 0, 0, 20460, 10000);
-VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 20460, 400);
-VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 20460, 400);
-VOLK_RUN_TESTS(volk_8i_convert_16i_a, 0, 0, 20460, 20000);
-VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 20460, 2000);
-VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 20460, 2000);
-VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 20460, 2000);
-
+VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a, 0, 256, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8i_convert_16i_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 20460, 1);
diff --git a/volk/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc b/volk/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc
new file mode 100644
index 000000000..ea23fc045
--- /dev/null
+++ b/volk/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_s32f_multiply_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.floatparam 4 scalar
+mulf dst, src1, scalar
diff --git a/volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc b/volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc
index 505e73f5d..d3bf78935 100644
--- a/volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc
+++ b/volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc
@@ -9,15 +9,15 @@
.temp 4 sumf
.temp 4 rootf
.temp 4 rootl
-.temp 4 maskl
+#.temp 4 maskl
x2 mulf prodiqf, src, src
splitql qf, if, prodiqf
addf sumf, if, qf
sqrtf rootf, sumf
mulf rootf, rootf, scalar
-cmpltf maskl, 32768.0, rootf
-andl maskl, maskl, 0x80000000
-orl rootf, rootf, maskl
+#cmpltf maskl, 32768.0, rootf
+#andl maskl, maskl, 0x80000000
+#orl rootf, rootf, maskl
convfl rootl, rootf
-convssslw dst, rootl
+convsuslw dst, rootl
diff --git a/volk/orc/volk_32fc_s32fc_multiply_32fc_a_orc_impl.orc b/volk/orc/volk_32fc_s32fc_multiply_32fc_a_orc_impl.orc
new file mode 100644
index 000000000..2577e034f
--- /dev/null
+++ b/volk/orc/volk_32fc_s32fc_multiply_32fc_a_orc_impl.orc
@@ -0,0 +1,18 @@
+.function volk_32fc_s32fc_multiply_32fc_a_orc_impl
+.source 8 src1
+.floatparam 8 scalar
+.dest 8 dst
+.temp 8 iqprod
+.temp 4 real
+.temp 4 imag
+.temp 4 ac
+.temp 4 bd
+.temp 8 swapped
+x2 mulf iqprod, src1, scalar
+splitql bd, ac, iqprod
+subf real, ac, bd
+swaplq swapped, src1
+x2 mulf iqprod, swapped, scalar
+splitql bd, ac, iqprod
+addf imag, ac, bd
+mergelq dst, real, imag