summaryrefslogtreecommitdiff
path: root/volk/include/volk/volk_32fc_square_dist_aligned16.h
blob: 6458ea4dd457ed909af707656a3e16daaa2b48a4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H
#define INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H

#include<inttypes.h>
#include<stdio.h>
#include<volk/volk_complex.h>

#if LV_HAVE_SSE3
#include<xmmintrin.h>
#include<pmmintrin.h>

/*!
  \brief Calculates the square distance between a single complex input
  (src0[0]) and each complex point in the points vector, using SSE3.

  \param target    output buffer of squared distances, one float per point
                   (must be 16-byte aligned for the _mm_store_ps calls)
  \param src0      pointer to the single reference complex value
  \param points    16-byte-aligned vector of complex points to compare against
  \param num_bytes byte size of the points vector; the number of points is
                   num_bytes / 8 (each lv_32fc_t is two floats)

  NOTE(review): the loads into xmm2/xmm3 before the main loop and the unrolled
  tail iteration after it execute unconditionally, so this kernel reads and
  writes at least 4 points' worth of data — presumably callers guarantee
  num_bytes >= 32; confirm against call sites.
*/
static inline void volk_32fc_square_dist_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
  

  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

  lv_32fc_t diff;
  float sq_dist;
  /* 32 bytes = 4 complex points processed per main-loop iteration. */
  int bound = num_bytes >> 5;
  /* One extra 2-point chunk if bit 4 of num_bytes is set. */
  int leftovers0 = (num_bytes >> 4) & 1;
  /* One final scalar point if bit 3 of num_bytes is set. */
  int leftovers1 = (num_bytes >> 3) & 1;
  int i = 0;

  xmm1 = _mm_setzero_ps();
  /* Load the reference complex value into the low two lanes... */
  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
  xmm2 = _mm_load_ps((float*)&points[0]);
  /* ...then duplicate it into the high lanes: xmm1 = [re, im, re, im]. */
  xmm1 = _mm_movelh_ps(xmm1, xmm1);
  xmm3 = _mm_load_ps((float*)&points[2]);
  

  /* Main loop: 4 points per iteration, software-pipelined so the loads for
     the next iteration overlap the arithmetic of the current one.  Runs
     bound - 1 times; the final iteration is peeled off below. */
  for(; i < bound - 1; ++i) {
    xmm4 = _mm_sub_ps(xmm1, xmm2);
    xmm5 = _mm_sub_ps(xmm1, xmm3);
    points += 4;
    xmm6 = _mm_mul_ps(xmm4, xmm4);
    xmm7 = _mm_mul_ps(xmm5, xmm5);
    
    xmm2 = _mm_load_ps((float*)&points[0]);
    
    /* Horizontal add pairs the squared re/im terms:
       xmm4 = [d0, d1, d2, d3] squared distances for 4 points. */
    xmm4 = _mm_hadd_ps(xmm6, xmm7);

    xmm3 = _mm_load_ps((float*)&points[2]);

    _mm_store_ps(target, xmm4);

    target += 4;

  }
  
  /* Peeled final main-loop iteration: consumes the xmm2/xmm3 values already
     loaded, without issuing further (possibly out-of-bounds) loads. */
  xmm4 = _mm_sub_ps(xmm1, xmm2);
  xmm5 = _mm_sub_ps(xmm1, xmm3);
  
  

  points += 4;
  xmm6 = _mm_mul_ps(xmm4, xmm4);
  xmm7 = _mm_mul_ps(xmm5, xmm5);
    
  xmm4 = _mm_hadd_ps(xmm6, xmm7);
   
  _mm_store_ps(target, xmm4);
  
  target += 4;

  /* Leftover pair: process 2 points with one vector op.  hadd of xmm6 with
     itself places the two distances in both halves; the high 64 bits are
     stored. */
  for(i = 0; i < leftovers0; ++i) {
    
    xmm2 = _mm_load_ps((float*)&points[0]);
    
    xmm4 = _mm_sub_ps(xmm1, xmm2);
    
    points += 2;
    
    xmm6 = _mm_mul_ps(xmm4, xmm4);

    xmm4 = _mm_hadd_ps(xmm6, xmm6);
    
    _mm_storeh_pi((__m64*)target, xmm4);

    target += 2;
  }

  /* Final odd point, if any, handled in scalar code. */
  for(i = 0; i < leftovers1; ++i) {
    
    diff = src0[0] - points[0];

    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

    target[0] = sq_dist;
  }
}

#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_GENERIC
/*!
  \brief Calculates the square distance between a single complex input
  (src0[0]) and each complex point in the points vector (generic/portable
  implementation).

  \param target    output buffer receiving one squared distance per point
  \param src0      pointer to the single reference complex value
  \param points    vector of complex points to compare against
  \param num_bytes byte size of the points vector; the number of points is
                   num_bytes / 8 (each lv_32fc_t is two floats)
*/
static inline void volk_32fc_square_dist_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
  const unsigned int num_points = num_bytes >> 3;
  unsigned int n;

  for(n = 0; n < num_points; ++n) {
    /* |src0 - points[n]|^2 = re(d)^2 + im(d)^2 */
    const lv_32fc_t d = src0[0] - points[n];
    target[n] = lv_creal(d) * lv_creal(d) + lv_cimag(d) * lv_cimag(d);
  }
}

#endif /*LV_HAVE_GENERIC*/


#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H*/