summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--volk/apps/volk_profile.cc1
-rw-r--r--volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h74
-rw-r--r--volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h261
-rw-r--r--volk/lib/qa_utils.cc16
-rw-r--r--volk/lib/qa_utils.h8
-rw-r--r--volk/lib/testqa.cc1
6 files changed, 354 insertions, 7 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 76b9f4031..0b4442108 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -16,6 +16,7 @@ int main(int argc, char *argv[]) {
//VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000, &results);
//VOLK_PROFILE(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000, &results);
+ VOLK_PUPPET_PROFILE(volk_32fc_s32fc_rotatorpuppet_32fc_a, volk_32fc_s32fc_x2_rotator_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(.95393, .3), 20460, 10000, &results);
VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 204600, 10000, &results);
VOLK_PROFILE(volk_16ic_deinterleave_real_8i_a, 0, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_16ic_deinterleave_16i_x2_a, 0, 0, 204600, 10000, &results);
diff --git a/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h
new file mode 100644
index 000000000..eee9f0064
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h
@@ -0,0 +1,74 @@
+#ifndef INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
+
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <volk/volk_32fc_s32fc_x2_rotator_32fc_a.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*!
+ \brief rotate input vector at fixed rate per sample from initial phase offset
+ \param outVector The vector where the results will be stored
+ \param inVector Vector to be rotated
+ \param phase_inc rotational velocity
+ \param phase initial phase offset
+ \param num_points The number of values in inVector to be rotated and stored into cVector
+*/
+
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+ lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)};
+ volk_32fc_s32fc_x2_rotator_32fc_a_generic(outVector, inVector, phase_inc, phase, num_points);
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+ volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc, phase, num_points);
+
+}
+
+
+
+
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+/*!
+ \brief rotate input vector at fixed rate per sample from initial phase offset
+ \param outVector The vector where the results will be stored
+ \param inVector Vector to be rotated
+ \param phase_inc rotational velocity
+ \param phase initial phase offset
+ \param num_points The number of values in inVector to be rotated and stored into cVector
+*/
+
+
+
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+ volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc, phase, num_points);
+
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
new file mode 100644
index 000000000..f8076adc3
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
@@ -0,0 +1,261 @@
+#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
+
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#define ROTATOR_RELOAD 512
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*!
+ \brief rotate input vector at fixed rate per sample from initial phase offset
+ \param outVector The vector where the results will be stored
+ \param inVector Vector to be rotated
+ \param phase_inc rotational velocity
+ \param phase initial phase offset
+ \param num_points The number of values in inVector to be rotated and stored into cVector
+*/
+
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+ *phase = lv_cmake(1.0, 0.0);
+ int i = 0;
+ int j = 0;
+ for(i = 0; i < (int)(num_points/ROTATOR_RELOAD); ++i) {
+ for(j = 0; j < ROTATOR_RELOAD; ++j) {
+ *outVector++ = *inVector++ * (*phase);
+ (*phase) *= phase_inc;
+ }
+ (*phase) /= abs((*phase));
+ }
+ for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
+ *outVector++ = *inVector++ * (*phase);
+ (*phase) *= phase_inc;
+ }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+ *phase = lv_cmake(1.0, 0.0);
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
+
+ int i, j = 0;
+
+ for(i = 0; i < 2; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+
+ /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
+ printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
+ __m128 aVal, phase_Val, inc_Val, cVal, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm_loadu_ps((float*)phase_Ptr);
+ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+
+ const unsigned int halfPoints = num_points / 2;
+
+
+ for(i = 0; i < (int)(halfPoints/ROTATOR_RELOAD); i++) {
+ for(j = 0; j < ROTATOR_RELOAD; ++j) {
+
+ aVal = _mm_load_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_store_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ phase_Val = _mm_div_ps(phase_Val, tmp1);
+ }
+ for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
+ aVal = _mm_load_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_store_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
+
+ _mm_storeu_ps((float*)phase_Ptr, phase_Val);
+ for(i = 0; i < num_points%2; ++i) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+/*!
+ \brief rotate input vector at fixed rate per sample from initial phase offset
+ \param outVector The vector where the results will be stored
+ \param inVector Vector to be rotated
+ \param phase_inc rotational velocity
+ \param phase initial phase offset
+ \param num_points The number of values in inVector to be rotated and stored into cVector
+*/
+
+
+
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+ *phase = lv_cmake(1.0, 0.0);
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+
+ int i, j = 0;
+
+ for(i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+
+ /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
+ printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
+ __m256 aVal, phase_Val, inc_Val, cVal, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros;
+
+ phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+ inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+ zeros = _mm256_set1_ps(0.0);
+ negated = _mm256_set1_ps(-1.0);
+ const unsigned int fourthPoints = num_points / 4;
+
+
+ for(i = 0; i < (int)(fourthPoints/ROTATOR_RELOAD); i++) {
+ for(j = 0; j < ROTATOR_RELOAD; ++j) {
+
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = _mm256_mul_ps(aVal, yl);
+ tmp1p = _mm256_mul_ps(phase_Val, ylp);
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
+
+ _mm256_store_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ phase_Val = _mm256_div_ps(phase_Val, tmp1);
+ }
+ for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = _mm256_mul_ps(aVal, yl);
+
+ tmp1p = _mm256_mul_ps(phase_Val, ylp);
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
+
+ _mm256_store_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+ for(i = 0; i < num_points%4; ++i) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index c15979b3f..5bdab260f 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -15,6 +15,7 @@
#include <volk/volk_common.h>
#include <boost/typeof/typeof.hpp>
#include <boost/type_traits.hpp>
+#include <stdio.h>
float uniform() {
return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1)
@@ -168,6 +169,7 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
}
//we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input!
assert(inputsig.size() != 0);
+
}
inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
@@ -261,7 +263,8 @@ bool run_volk_tests(struct volk_func_desc desc,
lv_32fc_t scalar,
int vlen,
int iter,
- std::vector<std::string> *best_arch_vector = 0
+ std::vector<std::string> *best_arch_vector = 0,
+ std::string puppet_master_name = "NULL"
) {
std::cout << "RUN_VOLK_TESTS: " << name << std::endl;
@@ -279,16 +282,16 @@ bool run_volk_tests(struct volk_func_desc desc,
//now we have to get a function signature by parsing the name
std::vector<volk_type_t> inputsig, outputsig;
get_signatures_from_name(inputsig, outputsig, name);
-
+
//pull the input scalars into their own vector
std::vector<volk_type_t> inputsc;
for(size_t i=0; i<inputsig.size(); i++) {
if(inputsig[i].is_scalar) {
inputsc.push_back(inputsig[i]);
inputsig.erase(inputsig.begin() + i);
+ i -= 1;
}
}
-
//for(int i=0; i<inputsig.size(); i++) std::cout << "Input: " << inputsig[i].str << std::endl;
//for(int i=0; i<outputsig.size(); i++) std::cout << "Output: " << outputsig[i].str << std::endl;
std::vector<void *> inbuffs;
@@ -450,7 +453,12 @@ bool run_volk_tests(struct volk_func_desc desc,
std::cout << "Best arch: " << best_arch << std::endl;
if(best_arch_vector) {
- best_arch_vector->push_back(name + std::string(" ") + best_arch);
+ if(puppet_master_name != "NULL") {
+ best_arch_vector->push_back(name + std::string(" ") + best_arch);
+ }
+ else {
+ best_arch_vector->push_back(puppet_master_name + std::string(" ") + best_arch);
+ }
}
return fail_global;
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
index b998df852..1e639ac3c 100644
--- a/volk/lib/qa_utils.h
+++ b/volk/lib/qa_utils.h
@@ -21,10 +21,12 @@ volk_type_t volk_type_from_string(std::string);
float uniform(void);
void random_floats(float *buf, unsigned n);
-bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *);
+bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string);
-#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0), 0); }
-#define VOLK_PROFILE(func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results)
+
+#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); }
+#define VOLK_PROFILE(func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL")
+#define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func))
typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*);
typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*);
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index d0204fc01..aac676729 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -101,3 +101,4 @@ VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1);