From 5585c71229cfa7886e0bd090828cd1f5104f6b27 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Fri, 15 Jun 2012 08:43:20 -0400
Subject: filter: adding ssc and fsf versions of filter with associated new
 Volk kernels.

These routines work and pass QA. They could use some performance work. the FSF is just slightly slower than before; the SCC version is more noticably slower.

Both could benefit, probably, by using SSE2 intrinsics to handle the shorts.
---
 volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h | 122 ++++++++++++++++++++++
 volk/include/volk/volk_32f_x2_dot_prod_16i_a.h    |  98 +++++++++++++++++
 2 files changed, 220 insertions(+)
 create mode 100644 volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
 create mode 100644 volk/include/volk/volk_32f_x2_dot_prod_16i_a.h

(limited to 'volk')

diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
new file mode 100644
index 000000000..940aa5de7
--- /dev/null
+++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+#define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+
+  static const int N_UNROLL = 4;
+
+  lv_32fc_t acc0 = 0;
+  lv_32fc_t acc1 = 0;
+  lv_32fc_t acc2 = 0;
+  lv_32fc_t acc3 = 0;
+
+  unsigned i = 0;
+  unsigned n = (num_points / N_UNROLL) * N_UNROLL;
+
+  for(i = 0; i < n; i += N_UNROLL) {
+    acc0 += taps[i + 0] * (float)input[i + 0];
+    acc1 += taps[i + 1] * (float)input[i + 1];
+    acc2 += taps[i + 2] * (float)input[i + 2];
+    acc3 += taps[i + 3] * (float)input[i + 3];
+  }
+
+  for(; i < num_points; i++) {
+    acc0 += taps[i] * (float)input[i];
+  }
+
+  *result = acc0 + acc1 + acc2 + acc3;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const  short* input, const  lv_32fc_t* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 8;
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const short* aPtr = input;
+  const float* bPtr = (float*)taps;
+
+  __m64  m0, m1;
+  __m128 f0, f1, f2, f3;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+
+    m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+    m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+    f0 = _mm_cvtpi16_ps(m0);
+    f1 = _mm_cvtpi16_ps(m0);
+    f2 = _mm_cvtpi16_ps(m1);
+    f3 = _mm_cvtpi16_ps(m1);
+
+    a0Val = _mm_unpacklo_ps(f0, f1);
+    a1Val = _mm_unpackhi_ps(f0, f1);
+    a2Val = _mm_unpacklo_ps(f2, f3);
+    a3Val = _mm_unpackhi_ps(f2, f3);
+
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 8;
+    bPtr += 16;
+  }
+
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  *realpt = dotProductVector[0];
+  *imagpt = dotProductVector[1];
+  *realpt += dotProductVector[2];
+  *imagpt += dotProductVector[3];
+
+  number = sixteenthPoints*8;
+  for(;number < num_points; number++){
+    *realpt += ((*aPtr)   * (*bPtr++));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
new file mode 100644
index 000000000..961c2418c
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
@@ -0,0 +1,98 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr=  taps;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = (int16_t)dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const  float* input, const  float* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = (short)dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/
-- 
cgit