summaryrefslogtreecommitdiff
path: root/volk
diff options
context:
space:
mode:
Diffstat (limited to 'volk')
-rw-r--r--volk/apps/volk_profile.cc3
-rw-r--r--volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h122
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_16i_a.h98
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_32f_a.h167
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_32f_u.h187
-rw-r--r--volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h111
-rw-r--r--volk/lib/testqa.cc5
7 files changed, 620 insertions, 73 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 0b4442108..6244abb35 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -33,6 +33,7 @@ int main(int argc, char *argv[]) {
//VOLK_PROFILE(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000, &results);
//VOLK_PROFILE(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000, &results);
VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_32f_x2_add_32f_u, 1e-4, 0, 204600, 10000, &results);
@@ -49,6 +50,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 204600, 5000, &results);
VOLK_PROFILE(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 204600, 1000, &results);
VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_32fc_index_max_16u_a, 3, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 204600, 100, &results);
VOLK_PROFILE(volk_32fc_magnitude_32f_a, 1e-4, 0, 204600, 1000, &results);
@@ -76,6 +78,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32f_x2_divide_32f_a, 1e-4, 0, 204600, 2000, &results);
VOLK_PROFILE(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 5000, &results);
VOLK_PROFILE(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 5000, &results);
//VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000, &results);
VOLK_PROFILE(volk_32f_index_max_16u_a, 3, 0, 204600, 5000, &results);
VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 204600, 3000, &results);
diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
new file mode 100644
index 000000000..940aa5de7
--- /dev/null
+++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+#define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+
+ static const int N_UNROLL = 4;
+
+ lv_32fc_t acc0 = 0;
+ lv_32fc_t acc1 = 0;
+ lv_32fc_t acc2 = 0;
+ lv_32fc_t acc3 = 0;
+
+ unsigned i = 0;
+ unsigned n = (num_points / N_UNROLL) * N_UNROLL;
+
+ for(i = 0; i < n; i += N_UNROLL) {
+ acc0 += taps[i + 0] * (float)input[i + 0];
+ acc1 += taps[i + 1] * (float)input[i + 1];
+ acc2 += taps[i + 2] * (float)input[i + 2];
+ acc3 += taps[i + 3] * (float)input[i + 3];
+ }
+
+ for(; i < num_points; i++) {
+ acc0 += taps[i] * (float)input[i];
+ }
+
+ *result = acc0 + acc1 + acc2 + acc3;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m64 m0, m1;
+ __m128 f0, f1, f2, f3;
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+ m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+ f0 = _mm_cvtpi16_ps(m0);
+ f1 = _mm_cvtpi16_ps(m0);
+ f2 = _mm_cvtpi16_ps(m1);
+ f3 = _mm_cvtpi16_ps(m1);
+
+ a0Val = _mm_unpacklo_ps(f0, f1);
+ a1Val = _mm_unpackhi_ps(f0, f1);
+ a2Val = _mm_unpacklo_ps(f2, f3);
+ a3Val = _mm_unpackhi_ps(f2, f3);
+
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 8;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints*8;
+ for(;number < num_points; number++){
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
new file mode 100644
index 000000000..961c2418c
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
@@ -0,0 +1,98 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (int16_t)dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
index 448b2fdc0..c26fd5e7c 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
@@ -31,39 +31,60 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa
static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int sixteenthPoints = num_points / 16;
float dotProduct = 0;
const float* aPtr = input;
const float* bPtr = taps;
- __m128 aVal, bVal, cVal;
-
- __m128 dotProdVal = _mm_setzero_ps();
-
- for(;number < quarterPoints; number++){
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
- cVal = _mm_mul_ps(aVal, bVal);
-
- dotProdVal = _mm_add_ps(cVal, dotProdVal);
+ for(;number < sixteenthPoints; number++){
- aPtr += 4;
- bPtr += 4;
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
}
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
__VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
dotProduct = dotProductVector[0];
dotProduct += dotProductVector[1];
dotProduct += dotProductVector[2];
dotProduct += dotProductVector[3];
- number = quarterPoints * 4;
+ number = sixteenthPoints*16;
for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -80,38 +101,59 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int sixteenthPoints = num_points / 16;
float dotProduct = 0;
const float* aPtr = input;
const float* bPtr = taps;
- __m128 aVal, bVal, cVal;
-
- __m128 dotProdVal = _mm_setzero_ps();
-
- for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
- cVal = _mm_mul_ps(aVal, bVal);
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
- dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+ for(;number < sixteenthPoints; number++){
- aPtr += 4;
- bPtr += 4;
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+ aPtr += 16;
+ bPtr += 16;
}
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
dotProduct = dotProductVector[0];
dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
- number = quarterPoints * 4;
+ number = sixteenthPoints*16;
for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -182,4 +224,65 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float
#endif /*LV_HAVE_SSE4_1*/
+#ifdef LV_HAVE_AVX
+
+static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr+8);
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr+8);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
index 3b7284b57..f9ae15094 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -1,6 +1,7 @@
#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+#include <volk/volk_common.h>
#include<stdio.h>
@@ -30,41 +31,65 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa
static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int sixteenthPoints = num_points / 16;
float dotProduct = 0;
const float* aPtr = input;
const float* bPtr = taps;
- __m128 aVal, bVal, cVal;
-
- __m128 dotProdVal = _mm_setzero_ps();
-
- for(;number < quarterPoints; number++){
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
- aVal = _mm_loadu_ps(aPtr);
- bVal = _mm_loadu_ps(bPtr);
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
- cVal = _mm_mul_ps(aVal, bVal);
-
- dotProdVal = _mm_add_ps(cVal, dotProdVal);
+ for(;number < sixteenthPoints; number++){
- aPtr += 4;
- bPtr += 4;
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
}
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
__VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
dotProduct = dotProductVector[0];
dotProduct += dotProductVector[1];
dotProduct += dotProductVector[2];
dotProduct += dotProductVector[3];
- number = quarterPoints * 4;
+ number = sixteenthPoints*16;
for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
}
*result = dotProduct;
@@ -79,38 +104,59 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int sixteenthPoints = num_points / 16;
float dotProduct = 0;
const float* aPtr = input;
const float* bPtr = taps;
- __m128 aVal, bVal, cVal;
-
- __m128 dotProdVal = _mm_setzero_ps();
-
- for(;number < quarterPoints; number++){
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
- aVal = _mm_loadu_ps(aPtr);
- bVal = _mm_loadu_ps(bPtr);
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
- cVal = _mm_mul_ps(aVal, bVal);
-
- dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+ for(;number < sixteenthPoints; number++){
- aPtr += 4;
- bPtr += 4;
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+ aPtr += 16;
+ bPtr += 16;
}
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
dotProduct = dotProductVector[0];
dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
- number = quarterPoints * 4;
+ number = sixteenthPoints*16;
for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -141,15 +187,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
for(;number < sixteenthPoints; number++){
- aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_load_ps(aPtr); aPtr += 4;
- bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_load_ps(bPtr); bPtr += 4;
cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
@@ -181,4 +227,65 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
#endif /*LV_HAVE_SSE4_1*/
+#ifdef LV_HAVE_AVX
+
+static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr+8);
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr+8);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
diff --git a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h
new file mode 100644
index 000000000..109b787e8
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h
@@ -0,0 +1,111 @@
+#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ *realpt = 0;
+ *imagpt = 0;
+
+ for(number = 0; number < num_points; number++){
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 x0Val, x1Val, x2Val, x3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+
+ x0Val = _mm_load_ps(bPtr);
+ x1Val = _mm_load_ps(bPtr);
+ x2Val = _mm_load_ps(bPtr+4);
+ x3Val = _mm_load_ps(bPtr+4);
+ b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+ b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+ b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+ b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 8;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints*8;
+ for(;number < num_points; number++){
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index aac676729..813e62217 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -20,6 +20,7 @@ VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1);
//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000);
//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000);
VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_add_32f_u, 1e-4, 0, 20460, 1);
@@ -35,6 +36,7 @@ VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1);
@@ -52,7 +54,8 @@ VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1);
-VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
+//VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1);
//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000);
VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32767, 20460, 1);