diff options
Diffstat (limited to 'volk')
-rw-r--r-- | volk/apps/volk_profile.cc | 1 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h | 74 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h | 262 | ||||
-rw-r--r-- | volk/include/volk/volk_64u_popcnt_a.h | 4 | ||||
-rw-r--r-- | volk/lib/qa_utils.cc | 16 | ||||
-rw-r--r-- | volk/lib/qa_utils.h | 8 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 1 |
7 files changed, 357 insertions, 9 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 76b9f4031..0b4442108 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -16,6 +16,7 @@ int main(int argc, char *argv[]) { //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000, &results); //VOLK_PROFILE(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000, &results); + VOLK_PUPPET_PROFILE(volk_32fc_s32fc_rotatorpuppet_32fc_a, volk_32fc_s32fc_x2_rotator_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(.95393, .3), 20460, 10000, &results); VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 204600, 10000, &results); VOLK_PROFILE(volk_16ic_deinterleave_real_8i_a, 0, 0, 204600, 10000, &results); VOLK_PROFILE(volk_16ic_deinterleave_16i_x2_a, 0, 0, 204600, 10000, &results); diff --git a/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h new file mode 100644 index 000000000..eee9f0064 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h @@ -0,0 +1,74 @@ +#ifndef INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H +#define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H + + +#include <volk/volk_complex.h> +#include <stdio.h> +#include <volk/volk_32fc_s32fc_x2_rotator_32fc_a.h> + + +#ifdef LV_HAVE_GENERIC + +/*! + \brief rotate input vector at fixed rate per sample from initial phase offset + \param outVector The vector where the results will be stored + \param inVector Vector to be rotated + \param phase_inc rotational velocity + \param phase initial phase offset + \param num_points The number of values in inVector to be rotated and stored into cVector +*/ + + +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ + lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)}; + volk_32fc_s32fc_x2_rotator_32fc_a_generic(outVector, inVector, phase_inc, phase, num_points); + +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> + +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ + lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; + volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc, phase, num_points); + +} + + + + +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_AVX +#include <immintrin.h> + +/*! + \brief rotate input vector at fixed rate per sample from initial phase offset + \param outVector The vector where the results will be stored + \param inVector Vector to be rotated + \param phase_inc rotational velocity + \param phase initial phase offset + \param num_points The number of values in inVector to be rotated and stored into cVector +*/ + + + + +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ + lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; + volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc, phase, num_points); + +} + +#endif /* LV_HAVE_AVX */ + + + + + + + + +#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h new file mode 100644 index 000000000..80c55e75f --- /dev/null +++ b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h @@ -0,0 +1,262 @@ +#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H +#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H + + +#include <volk/volk_complex.h> +#include <stdio.h> +#include <stdlib.h> +#define ROTATOR_RELOAD 512 + + +#ifdef LV_HAVE_GENERIC + +/*! + \brief rotate input vector at fixed rate per sample from initial phase offset + \param outVector The vector where the results will be stored + \param inVector Vector to be rotated + \param phase_inc rotational velocity + \param phase initial phase offset + \param num_points The number of values in inVector to be rotated and stored into cVector +*/ + + +static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ + *phase = lv_cmake(1.0, 0.0); + unsigned int i = 0; + int j = 0; + for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) { + for(j = 0; j < ROTATOR_RELOAD; ++j) { + *outVector++ = *inVector++ * (*phase); + (*phase) *= phase_inc; + } + (*phase) /= abs((*phase)); + } + for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) { + *outVector++ = *inVector++ * (*phase); + (*phase) *= phase_inc; + } + +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> + +static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ + *phase = lv_cmake(1.0, 0.0); + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = 1; + lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)}; + + unsigned int i, j = 0; + + for(i = 0; i < 2; ++i) { + phase_Ptr[i] *= incr; + incr *= (phase_inc); + } + + /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0])); + printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1])); + printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2])); + printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3])); + printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ + __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; + + phase_Val = _mm_loadu_ps((float*)phase_Ptr); + inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); + + const unsigned int halfPoints = num_points / 2; + + + for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) { + for(j = 0; j < ROTATOR_RELOAD; ++j) { + + aVal = _mm_load_ps((float*)aPtr); + + yl = _mm_moveldup_ps(phase_Val); + yh = _mm_movehdup_ps(phase_Val); + ylp = _mm_moveldup_ps(inc_Val); + yhp = _mm_movehdup_ps(inc_Val); + + tmp1 = _mm_mul_ps(aVal, yl); + tmp1p = _mm_mul_ps(phase_Val, ylp); + + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm_mul_ps(aVal, yh); + tmp2p = _mm_mul_ps(phase_Val, yhp); + + z = _mm_addsub_ps(tmp1, tmp2); + phase_Val = _mm_addsub_ps(tmp1p, tmp2p); + + _mm_store_ps((float*)cPtr, z); + + aPtr += 2; + cPtr += 2; + } + tmp1 = _mm_mul_ps(phase_Val, phase_Val); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + phase_Val = _mm_div_ps(phase_Val, tmp1); + } + for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) { + aVal = _mm_load_ps((float*)aPtr); + + yl = _mm_moveldup_ps(phase_Val); + yh = _mm_movehdup_ps(phase_Val); + ylp = _mm_moveldup_ps(inc_Val); + yhp = _mm_movehdup_ps(inc_Val); + + tmp1 = _mm_mul_ps(aVal, yl); + + tmp1p = _mm_mul_ps(phase_Val, ylp); + + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm_mul_ps(aVal, yh); + tmp2p = _mm_mul_ps(phase_Val, yhp); + + z = _mm_addsub_ps(tmp1, tmp2); + phase_Val = _mm_addsub_ps(tmp1p, tmp2p); + + _mm_store_ps((float*)cPtr, z); + + aPtr += 2; + cPtr += 2; + } + + _mm_storeu_ps((float*)phase_Ptr, phase_Val); + for(i = 0; i < num_points%2; ++i) { + *cPtr++ = *aPtr++ * phase_Ptr[0]; + phase_Ptr[0] *= (phase_inc); + } + + (*phase) = phase_Ptr[0]; + +} + +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_AVX +#include <immintrin.h> + +/*! + \brief rotate input vector at fixed rate per sample from initial phase offset + \param outVector The vector where the results will be stored + \param inVector Vector to be rotated + \param phase_inc rotational velocity + \param phase initial phase offset + \param num_points The number of values in inVector to be rotated and stored into cVector +*/ + + + + +static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ + *phase = lv_cmake(1.0, 0.0); + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = 1; + lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; + + unsigned int i, j = 0; + + for(i = 0; i < 4; ++i) { + phase_Ptr[i] *= incr; + incr *= (phase_inc); + } + + /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0])); + printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1])); + printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2])); + printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3])); + printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ + __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros; + + phase_Val = _mm256_loadu_ps((float*)phase_Ptr); + inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); + zeros = _mm256_set1_ps(0.0); + negated = _mm256_set1_ps(-1.0); + const unsigned int fourthPoints = num_points / 4; + + + for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { + for(j = 0; j < ROTATOR_RELOAD; ++j) { + + aVal = _mm256_load_ps((float*)aPtr); + + yl = _mm256_moveldup_ps(phase_Val); + yh = _mm256_movehdup_ps(phase_Val); + ylp = _mm256_moveldup_ps(inc_Val); + yhp = _mm256_movehdup_ps(inc_Val); + + tmp1 = _mm256_mul_ps(aVal, yl); + tmp1p = _mm256_mul_ps(phase_Val, ylp); + + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm256_mul_ps(aVal, yh); + tmp2p = _mm256_mul_ps(phase_Val, yhp); + + z = _mm256_addsub_ps(tmp1, tmp2); + phase_Val = _mm256_addsub_ps(tmp1p, tmp2p); + + _mm256_store_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + tmp1 = _mm256_mul_ps(phase_Val, phase_Val); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + phase_Val = _mm256_div_ps(phase_Val, tmp1); + } + for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { + aVal = _mm256_load_ps((float*)aPtr); + + yl = _mm256_moveldup_ps(phase_Val); + yh = _mm256_movehdup_ps(phase_Val); + ylp = _mm256_moveldup_ps(inc_Val); + yhp = _mm256_movehdup_ps(inc_Val); + + tmp1 = _mm256_mul_ps(aVal, yl); + + tmp1p = _mm256_mul_ps(phase_Val, ylp); + + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm256_mul_ps(aVal, yh); + tmp2p = _mm256_mul_ps(phase_Val, yhp); + + z = _mm256_addsub_ps(tmp1, tmp2); + phase_Val = _mm256_addsub_ps(tmp1p, tmp2p); + + _mm256_store_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + + _mm256_storeu_ps((float*)phase_Ptr, phase_Val); + for(i = 0; i < num_points%4; ++i) { + *cPtr++ = *aPtr++ * phase_Ptr[0]; + phase_Ptr[0] *= (phase_inc); + } + + (*phase) = phase_Ptr[0]; + +} + +#endif /* LV_HAVE_AVX */ + + + + + + + + +#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */ diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h index 7d7359ccf..5e68ed208 100644 --- a/volk/include/volk/volk_64u_popcnt_a.h +++ b/volk/include/volk/volk_64u_popcnt_a.h @@ -14,7 +14,7 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value // This is faster than a lookup table //uint32_t retVal = valueVector[0]; - uint32_t retVal = (uint32_t)(value && 0x00000000FFFFFFFF); + uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFF); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); @@ -24,7 +24,7 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value uint64_t retVal64 = retVal; //retVal = valueVector[1]; - retVal = (uint32_t)((value && 0xFFFFFFFF00000000) >> 31); + retVal = (uint32_t)((value & 0xFFFFFFFF00000000) >> 31); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index c15979b3f..4e361aece 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -15,6 +15,7 @@ #include <volk/volk_common.h> #include <boost/typeof/typeof.hpp> #include <boost/type_traits.hpp> +#include <stdio.h> float uniform() { return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) @@ -168,6 +169,7 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, } //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input! assert(inputsig.size() != 0); + } inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { @@ -261,7 +263,8 @@ bool run_volk_tests(struct volk_func_desc desc, lv_32fc_t scalar, int vlen, int iter, - std::vector<std::string> *best_arch_vector = 0 + std::vector<std::string> *best_arch_vector = 0, + std::string puppet_master_name = "NULL" ) { std::cout << "RUN_VOLK_TESTS: " << name << std::endl; @@ -279,16 +282,16 @@ bool run_volk_tests(struct volk_func_desc desc, //now we have to get a function signature by parsing the name std::vector<volk_type_t> inputsig, outputsig; get_signatures_from_name(inputsig, outputsig, name); - + //pull the input scalars into their own vector std::vector<volk_type_t> inputsc; for(size_t i=0; i<inputsig.size(); i++) { if(inputsig[i].is_scalar) { inputsc.push_back(inputsig[i]); inputsig.erase(inputsig.begin() + i); + i -= 1; } } - //for(int i=0; i<inputsig.size(); i++) std::cout << "Input: " << inputsig[i].str << std::endl; //for(int i=0; i<outputsig.size(); i++) std::cout << "Output: " << outputsig[i].str << std::endl; std::vector<void *> inbuffs; @@ -450,7 +453,12 @@ bool run_volk_tests(struct volk_func_desc desc, std::cout << "Best arch: " << best_arch << std::endl; if(best_arch_vector) { - best_arch_vector->push_back(name + std::string(" ") + best_arch); + if(puppet_master_name == "NULL") { + best_arch_vector->push_back(name + std::string(" ") + best_arch); + } + else { + best_arch_vector->push_back(puppet_master_name + std::string(" ") + best_arch); + } } return fail_global; diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h index b998df852..1e639ac3c 100644 --- a/volk/lib/qa_utils.h +++ b/volk/lib/qa_utils.h @@ -21,10 +21,12 @@ volk_type_t volk_type_from_string(std::string); float uniform(void); void random_floats(float *buf, unsigned n); -bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *); +bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string); -#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0), 0); } -#define VOLK_PROFILE(func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results) + +#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); } +#define VOLK_PROFILE(func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL") +#define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func)) typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*); typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*); diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index d0204fc01..aac676729 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -101,3 +101,4 @@ VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1); |