summaryrefslogtreecommitdiff
path: root/volk/lib/qa_32f_sum_of_poly_aligned16.cc
blob: 49477635747763a39c7139bb4d702be01cb39651 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include <volk/volk.h>
#include <qa_32f_sum_of_poly_aligned16.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define SNR 30.0
#define CENTER -4.0
#define CUTOFF -5.595
#define ERR_DELTA (1e-4)
#define NUM_ITERS 100000
#define VEC_LEN 64
/*
 * Draw one pseudo-random float by scaling rand() with RAND_MAX.
 * rand() may return both 0 and RAND_MAX, so the result lies in the
 * closed range [0, 1].
 */
static float uniform() {
  const float sample = (float) rand();
  return sample / RAND_MAX;
}

/*
 * Fill the first n entries of buf with random test inputs.
 * Each entry is uniform() scaled by -SNR/2.0, so every value is
 * non-positive and no smaller than -SNR/2.
 */
static void
random_floats (float *buf, unsigned n)
{
  float *const stop = buf + n;
  for (float *cursor = buf; cursor != stop; ++cursor) {
    *cursor = uniform () * -SNR/2.0;
  }
}


#ifndef LV_HAVE_SSE3

/*
 * Placeholder used when LV_HAVE_SSE3 is not defined: there is no sse3
 * implementation to compare against, so the test only reports that fact.
 */
void qa_32f_sum_of_poly_aligned16::t1(){
  fputs("sse3 not available... no test performed\n", stdout);
}

#else


void qa_32f_sum_of_poly_aligned16::t1(){
  int i = 0;
  
  volk_environment_init();
  int ret;

  const int vlen = VEC_LEN;
  float cutoff = CUTOFF;
  
  float* center_point_array;
  float* target;
  float* target_generic;
  float* src0 ;


  ret = posix_memalign((void**)&center_point_array, 16, 24);
  ret = posix_memalign((void**)&target, 16, 4);
  ret = posix_memalign((void**)&target_generic, 16, 4);
  ret = posix_memalign((void**)&src0, 16, (vlen << 2));
  
 
  random_floats((float*)src0, vlen);
 
  float a = (float)CENTER;
  float etoa = expf(a);
  center_point_array[0] = (//(5.0 * a * a * a * a)/120.0 +
			   (-4.0 * a * a * a)/24.0 + 
			   (3.0 * a * a)/6.0 +
			   (-2.0 * a)/2.0 +
			   (1.0)) * etoa;
  center_point_array[1] = (//(-10.0 * a * a * a)/120.0 +
			   (6.0 * a * a)/24.0 + 
			   (-3.0 * a)/6.0 +
			   (1.0/2.0)) * etoa;
  center_point_array[2] = (//(10.0 * a * a)/120.0 +
			   (-4.0 * a)/24.0 +
			   (1.0/6.0)) * etoa;
  center_point_array[3] = (//(-5.0 * a)/120.0 +
			   (1.0/24.0)) * etoa;
  //center_point_array[4] = ((1.0)/120.0) * etoa;
  center_point_array[4] = (//(a * a * a * a * a)/120.0 +
			   (a * a * a * a)/24.0 +
			   (a * a * a)/-6.0 +
			   (a * a)/2.0 +
			   -a + 1.0) * etoa;
  
  printf("32f_sum_of_poly_aligned16\n");

  clock_t start, end;
  double total;
  
  float my_sum = 0.0;
  start = clock();
  for(int k = 0; k < NUM_ITERS; ++k) {
    float sum = 0.0;
    for(int l = 0; l < vlen; ++l) {
      
      sum += expf(src0[l]);
      
    }
    my_sum = sum;
  }
  
  
  end = clock();  
  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
  printf("exp time: %f\n", total);
  
  start = clock();
  for(int k = 0; k < NUM_ITERS; ++k) {
    
    volk_32f_sum_of_poly_aligned16_manual(target_generic, src0, center_point_array, &cutoff, vlen << 2, "generic");
  
  }
  
  
  end = clock();  
  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
  printf("generic time: %f\n", total);
  
  start = clock();
  for(int k = 0; k < NUM_ITERS; ++k) {
    volk_32f_sum_of_poly_aligned16_manual(target, src0, center_point_array, &cutoff, vlen << 2, "sse3");
  }
  
  end = clock();  
  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
  printf("sse3 approx time: %f\n", total);


  
  printf("exp: %f, sse3: %f\n", my_sum, target[i]);
  CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], fabs(target_generic[0]) * ERR_DELTA);
  

  free(center_point_array);
  free(target);
  free(target_generic);
  free(src0);

  
}

#endif /*LV_HAVE_SSE3*/