volk/kernels/volk/volk_32f_s32f_power_32f.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144

#ifndef INCLUDED_volk_32f_s32f_power_32f_a_H
#define INCLUDED_volk_32f_s32f_power_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <math.h>

#ifdef LV_HAVE_SSE4_1
#include <tmmintrin.h>

#ifdef LV_HAVE_LIB_SIMDMATH
#include <simdmath.h>
#endif /* LV_HAVE_LIB_SIMDMATH */

/*!
  \brief Takes each the input vector value to the specified power and stores the results in the return vector
  \param cVector The vector where the results will be stored
  \param aVector The vector of values to be taken to a power
  \param power The power value to be applied to each data point
  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
*/
static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){
  unsigned int number = 0;

  float* cPtr = cVector;
  const float* aPtr = aVector;

#ifdef LV_HAVE_LIB_SIMDMATH
  const unsigned int quarterPoints = num_points / 4;
  __m128 vPower = _mm_set_ps1(power);
  __m128 zeroValue = _mm_setzero_ps();
  __m128 signMask;
  __m128 negatedValues;
  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
  __m128 onesMask = _mm_set_ps1(1);

  __m128 aVal, cVal;
  for(;number < quarterPoints; number++){

    aVal = _mm_load_ps(aPtr);
    signMask = _mm_cmplt_ps(aVal, zeroValue);
    negatedValues = _mm_sub_ps(zeroValue, aVal);
    aVal = _mm_blendv_ps(aVal, negatedValues, signMask);

    // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
    cVal = powf4(aVal, vPower); // Takes each input value to the specified power

    cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);

    _mm_store_ps(cPtr,cVal); // Store the results back into the C container

    aPtr += 4;
    cPtr += 4;
  }

  number = quarterPoints * 4;
#endif /* LV_HAVE_LIB_SIMDMATH */

  for(;number < num_points; number++){
    *cPtr++ = powf((*aPtr++), power);
  }
}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

#ifdef LV_HAVE_LIB_SIMDMATH
#include <simdmath.h>
#endif /* LV_HAVE_LIB_SIMDMATH */

/*!
  \brief Takes each the input vector value to the specified power and stores the results in the return vector
  \param cVector The vector where the results will be stored
  \param aVector The vector of values to be taken to a power
  \param power The power value to be applied to each data point
  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
*/
static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){
  unsigned int number = 0;

  float* cPtr = cVector;
  const float* aPtr = aVector;

#ifdef LV_HAVE_LIB_SIMDMATH
  const unsigned int quarterPoints = num_points / 4;
  __m128 vPower = _mm_set_ps1(power);
  __m128 zeroValue = _mm_setzero_ps();
  __m128 signMask;
  __m128 negatedValues;
  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
  __m128 onesMask = _mm_set_ps1(1);

  __m128 aVal, cVal;
  for(;number < quarterPoints; number++){

    aVal = _mm_load_ps(aPtr);
    signMask = _mm_cmplt_ps(aVal, zeroValue);
    negatedValues = _mm_sub_ps(zeroValue, aVal);
    aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) );

    // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
    cVal = powf4(aVal, vPower); // Takes each input value to the specified power

    cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal);

    _mm_store_ps(cPtr,cVal); // Store the results back into the C container

    aPtr += 4;
    cPtr += 4;
  }

  number = quarterPoints * 4;
#endif /* LV_HAVE_LIB_SIMDMATH */

  for(;number < num_points; number++){
    *cPtr++ = powf((*aPtr++), power);
  }
}
#endif /* LV_HAVE_SSE */

#ifdef LV_HAVE_GENERIC
  /*!
    \brief Takes each the input vector value to the specified power and stores the results in the return vector
    \param cVector The vector where the results will be stored
    \param aVector The vector of values to be taken to a power
    \param power The power value to be applied to each data point
    \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
  */
static inline void volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){
  float* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = powf((*aPtr++), power);
  }
}
#endif /* LV_HAVE_GENERIC */


#endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */