1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
|
#ifndef INCLUDED_volk_32f_index_max_16u_a_H
#define INCLUDED_volk_32f_index_max_16u_a_H
#include <volk/volk_common.h>
#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE4_1
#include<smmintrin.h>
/*!
 * \brief Writes the index of the largest element of src0 to target[0] (SSE4.1).
 *
 * When the maximum occurs more than once, the lowest index is reported.
 * Elements that compare false against the running maximum (e.g. NaN) never
 * replace it.
 *
 * \param target     Output; target[0] receives the index. It is left untouched
 *                   when num_points is 0.
 * \param src0       Input samples. Read with _mm_load_ps, so the buffer must be
 *                   16-byte aligned.
 * \param num_points Number of floats in src0.
 *
 * \note Inside the vector loop the candidate indices are carried as floats,
 *       so they are only exact while they fit in a float's 24-bit significand.
 */
static inline void volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) {
  if(num_points > 0){
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    const float* inputPtr = src0;

    __m128 indexIncrementValues = _mm_set1_ps(4);
    /* Lanes start at -4..-1 so the first increment yields indices 0..3. */
    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);

    float max = src0[0];
    unsigned int index = 0;
    __m128 maxValues = _mm_set1_ps(max);
    __m128 maxValuesIndex = _mm_setzero_ps();
    __m128 compareResults;
    __m128 currentValues;

    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
    __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];

    for(;number < quarterPoints; number++){
      currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
      /* Replace only on a strictly greater value: ties keep the earlier
         index and unordered (NaN) comparisons keep the stored maximum. */
      compareResults = _mm_cmpgt_ps(currentValues, maxValues);
      maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
      maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
    }

    /* Reduce the four per-lane maxima to one; on equal values prefer the
       lane holding the smaller index. */
    _mm_store_ps(maxValuesBuffer, maxValues);
    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);

    for(number = 0; number < 4; number++){
      const unsigned int laneIndex = (unsigned int)maxIndexesBuffer[number];
      if(maxValuesBuffer[number] > max ||
         (maxValuesBuffer[number] == max && laneIndex < index)){
        index = laneIndex;
        max = maxValuesBuffer[number];
      }
    }

    /* Scalar pass over the (num_points % 4) trailing points. */
    number = quarterPoints * 4;
    for(;number < num_points; number++){
      if(src0[number] > max){
        index = number;
        max = src0[number];
      }
    }
    target[0] = index;
  }
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_SSE
#include<xmmintrin.h>
/*!
 * \brief Writes the index of the largest element of src0 to target[0] (SSE).
 *
 * When the maximum occurs more than once, the lowest index is reported.
 * Elements that compare false against the running maximum (e.g. NaN) never
 * replace it.
 *
 * \param target     Output; target[0] receives the index. It is left untouched
 *                   when num_points is 0.
 * \param src0       Input samples. Read with _mm_load_ps, so the buffer must be
 *                   16-byte aligned.
 * \param num_points Number of floats in src0.
 *
 * \note Inside the vector loop the candidate indices are carried as floats,
 *       so they are only exact while they fit in a float's 24-bit significand.
 */
static inline void volk_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) {
  if(num_points > 0){
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    const float* inputPtr = src0;

    __m128 indexIncrementValues = _mm_set1_ps(4);
    /* Lanes start at -4..-1 so the first increment yields indices 0..3. */
    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);

    float max = src0[0];
    unsigned int index = 0;
    __m128 maxValues = _mm_set1_ps(max);
    __m128 maxValuesIndex = _mm_setzero_ps();
    __m128 compareResults;
    __m128 currentValues;

    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
    __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];

    for(;number < quarterPoints; number++){
      currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
      /* Replace only on a strictly greater value: ties keep the earlier
         index and unordered (NaN) comparisons keep the stored maximum. */
      compareResults = _mm_cmpgt_ps(currentValues, maxValues);
      /* Bitwise select: take the new lane where the mask is set, otherwise
         keep the stored lane. */
      maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), _mm_andnot_ps(compareResults, maxValuesIndex));
      maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), _mm_andnot_ps(compareResults, maxValues));
    }

    /* Reduce the four per-lane maxima to one; on equal values prefer the
       lane holding the smaller index. */
    _mm_store_ps(maxValuesBuffer, maxValues);
    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);

    for(number = 0; number < 4; number++){
      const unsigned int laneIndex = (unsigned int)maxIndexesBuffer[number];
      if(maxValuesBuffer[number] > max ||
         (maxValuesBuffer[number] == max && laneIndex < index)){
        index = laneIndex;
        max = maxValuesBuffer[number];
      }
    }

    /* Scalar pass over the (num_points % 4) trailing points. */
    number = quarterPoints * 4;
    for(;number < num_points; number++){
      if(src0[number] > max){
        index = number;
        max = src0[number];
      }
    }
    target[0] = index;
  }
}
#endif /*LV_HAVE_SSE*/
#ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable scan that writes the index of the largest element of src0
 *        to target[0].
 *
 * Only a strictly greater element replaces the running maximum, so the first
 * occurrence wins when the maximum is repeated.
 *
 * \param target     Output; target[0] receives the index. It is left untouched
 *                   when num_points is 0.
 * \param src0       Input samples.
 * \param num_points Number of floats in src0.
 */
static inline void volk_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) {
  unsigned int bestPos = 0;
  unsigned int pos;
  float best;

  /* Nothing to search: do not write to the output. */
  if(num_points == 0){
    return;
  }

  /* Element 0 seeds the running maximum; the scan starts at element 1. */
  best = src0[0];
  for(pos = 1; pos < num_points; pos++){
    const float candidate = src0[pos];
    if(candidate > best){
      best = candidate;
      bestPos = pos;
    }
  }

  *target = bestPos;
}
#endif /*LV_HAVE_GENERIC*/
#endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
|