diff options
-rw-r--r-- | volk/include/volk/volk_32f_normalize_aligned16.h | 15 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_32f_multiply_aligned16.h | 13 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_multiply_aligned16.h | 17 | ||||
-rw-r--r-- | volk/lib/Makefile.am | 2 | ||||
-rw-r--r-- | volk/lib/qa_32f_normalize_aligned16.cc | 13 | ||||
-rw-r--r-- | volk/lib/qa_32fc_32f_multiply_aligned16.cc | 84 | ||||
-rw-r--r-- | volk/lib/qa_32fc_multiply_aligned16.cc | 12 | ||||
-rw-r--r-- | volk/orc/Makefile.am | 3 |
8 files changed, 112 insertions, 47 deletions
diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h index 1aabb1d9d..27fb5f7fa 100644 --- a/volk/include/volk/volk_32f_normalize_aligned16.h +++ b/volk/include/volk/volk_32f_normalize_aligned16.h @@ -60,6 +60,21 @@ static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Normalizes the input vector by a scalar, writing the results back into the same vector + \param vecBuffer The vector to be normalized; it also receives the results + \param scalar The value to normalize by + \param num_points The number of values in vecBuffer to be normalized + \note The reciprocal of scalar is computed once here and passed to the ORC implementation +*/ +extern void volk_32f_normalize_aligned16_orc_impl(float* vecBuffer, const float scalar, unsigned int num_points); +static inline void volk_32f_normalize_aligned16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ + float invscalar = 1.0 / scalar; + volk_32f_normalize_aligned16_orc_impl(vecBuffer, invscalar, num_points); +} +#endif /* LV_HAVE_ORC */ + diff --git a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h index 436656ca0..304ed8e2d 100644 --- a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h +++ b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h @@ -76,6 +76,19 @@ static inline void volk_32fc_32f_multiply_aligned16_generic(lv_32fc_t* cVector, } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC + /*! 
+ \brief Multiplies the input complex vector with the input float vector and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vector containing the float values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_32f_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32fc_32f_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + volk_32fc_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32fc_multiply_aligned16.h b/volk/include/volk/volk_32fc_multiply_aligned16.h index 6a1649fdb..c8f2418c3 100644 --- a/volk/include/volk/volk_32fc_multiply_aligned16.h +++ b/volk/include/volk/volk_32fc_multiply_aligned16.h @@ -4,6 +4,7 @@ #include <inttypes.h> #include <stdio.h> #include <volk/volk_complex.h> +#include <float.h> #if LV_HAVE_SSE3 #include <pmmintrin.h> @@ -72,6 +73,22 @@ static inline void volk_32fc_multiply_aligned16_generic(lv_32fc_t* cVector, cons } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); +static inline void volk_32fc_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + static const float mask = -0.0; + volk_32fc_multiply_aligned16_orc_impl(cVector, aVector, bVector, mask, num_points); +} +#endif /* LV_HAVE_ORC */ + + diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 253033461..0aeafe4aa 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -156,6 +156,7 @@ endif # ---------------------------------------------------------------- libvolk_qa_la_SOURCES = \ qa_volk.cc \ + qa_utils.cc \ qa_16s_quad_max_star_aligned16.cc \ qa_32fc_dot_prod_aligned16.cc \ qa_32fc_square_dist_aligned16.cc \ @@ -257,6 +258,7 @@ libvolk_qa_la_LIBADD = \ noinst_HEADERS = \ volk_init.h \ qa_volk.h \ + qa_utils.h \ assembly.h \ qa_16s_quad_max_star_aligned16.h \ qa_32fc_dot_prod_aligned16.h \ diff --git a/volk/lib/qa_32f_normalize_aligned16.cc b/volk/lib/qa_32f_normalize_aligned16.cc index 1c7b485a6..0da43ecff 100644 --- a/volk/lib/qa_32f_normalize_aligned16.cc +++ b/volk/lib/qa_32f_normalize_aligned16.cc @@ -26,13 +26,16 @@ void qa_32f_normalize_aligned16::t1() { float* output0; float* output01; + float* output02; ret = posix_memalign((void**)&output0, 16, vlen*sizeof(float)); ret = posix_memalign((void**)&output01, 16, vlen*sizeof(float)); + ret = posix_memalign((void**)&output02, 16, vlen*sizeof(float)); for(int i = 0; i < vlen; ++i) { 
output0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); } memcpy(output01, output0, vlen*sizeof(float)); + memcpy(output02, output0, vlen*sizeof(float)); printf("32f_normalize_aligned\n"); start = clock(); @@ -49,6 +52,14 @@ void qa_32f_normalize_aligned16::t1() { end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_normalize_aligned16_manual(output02, 1.15, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + for(int i = 0; i < 1; ++i) { //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); @@ -57,10 +68,12 @@ void qa_32f_normalize_aligned16::t1() { for(int i = 0; i < vlen; ++i) { // printf("%e...%e\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output02[i], fabs(output0[i])*1e-4); } free(output0); free(output01); + free(output02); } #endif diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.cc b/volk/lib/qa_32fc_32f_multiply_aligned16.cc index 4eba0a3cd..7bb8d21c1 100644 --- a/volk/lib/qa_32fc_32f_multiply_aligned16.cc +++ b/volk/lib/qa_32fc_32f_multiply_aligned16.cc @@ -2,28 +2,12 @@ #include <volk/volk.h> #include <qa_32fc_32f_multiply_aligned16.h> #include <stdlib.h> -#include <math.h> #include <time.h> - -#define assertcomplexEqual(expected, actual, delta) \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); +#include <string.h> +#include <qa_utils.h> #define ERR_DELTA (1e-4) -//test for sse -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static 
void -random_floats (float *buf, unsigned n) -{ - for (unsigned i = 0; i < n; i++) - buf[i] = uniform (); -} - -#ifdef LV_HAVE_SSE3 void qa_32fc_32f_multiply_aligned16::t1() { const int vlen = 2046; @@ -36,50 +20,56 @@ void qa_32fc_32f_multiply_aligned16::t1() { std::complex<float>* input; float * taps; int i; + std::vector<std::string> archs; + archs.push_back("generic"); +#ifdef LV_HAVE_SSE3 + archs.push_back("sse3"); +#endif +#ifdef LV_HAVE_ORC + archs.push_back("orc"); +#endif - std::complex<float>* result_generic; - std::complex<float>* result_sse3; + std::vector<std::complex<float>* > results; ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float)); ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float)); - ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float)); - ret = posix_memalign((void**)&result_sse3, 16, vlen * 2 * sizeof(float)); + + for(i=0; i < archs.size(); i++) { + std::complex<float> *ptr; + ret = posix_memalign((void**)&ptr, 16, vlen * 2 * sizeof(float)); + if(ret) { + printf("Couldn't allocate memory\n"); + exit(1); + } + results.push_back(ptr); + } random_floats((float*)input, vlen * 2); random_floats(taps, vlen); printf("32fc_32f_multiply_aligned16\n"); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_32f_multiply_aligned16_manual(result_generic, input, taps, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_32f_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3"); + for(i=0; i < archs.size(); i++) { + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_multiply_aligned16_manual(results[i], input, taps, vlen, archs[i].c_str()); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("%s_time: %f\n", archs[i].c_str(), total); } - end = clock(); - total 
= (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - for(i = 0; i < vlen; i++){ - assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA); + for(i=0; i < vlen; i++) { + int j = 1; + for(j; j < archs.size(); j++) { + assertcomplexEqual(results[0][i], results[j][i], ERR_DELTA); + } } free(input); free(taps); - free(result_generic); - free(result_sse3); - -} -#else -void qa_32fc_32f_multiply_aligned16::t1() { - printf("sse3 not available... no test performed\n"); + for(i=0; i < archs.size(); i++) { + free(results[i]); + } } -#endif /* LV_HAVE_SSE3 */ - diff --git a/volk/lib/qa_32fc_multiply_aligned16.cc b/volk/lib/qa_32fc_multiply_aligned16.cc index e1f7eab3d..022b58ad6 100644 --- a/volk/lib/qa_32fc_multiply_aligned16.cc +++ b/volk/lib/qa_32fc_multiply_aligned16.cc @@ -41,11 +41,13 @@ void qa_32fc_multiply_aligned16::t1() { std::complex<float>* result_generic; std::complex<float>* result_sse3; + std::complex<float>* result_orc; ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(float)); ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(float)); ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float)); ret = posix_memalign((void**)&result_sse3, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&result_orc, 16, vlen*2*sizeof(float)); random_floats((float*)input, vlen * 2); random_floats((float*)taps, vlen * 2); @@ -67,15 +69,25 @@ void qa_32fc_multiply_aligned16::t1() { end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse3_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_multiply_aligned16_manual(result_orc, input, taps, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); for(i = 0; i < vlen; i++){ assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA); + assertcomplexEqual(result_generic[i], result_orc[i], ERR_DELTA); } free(input); 
free(taps); free(result_generic); free(result_sse3); + free(result_orc); } #else diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am index a469638c1..066050a7c 100644 --- a/volk/orc/Makefile.am +++ b/volk/orc/Makefile.am @@ -34,9 +34,12 @@ volk_32f_add_aligned16_orc_impl.orc \ volk_32f_subtract_aligned16_orc_impl.orc \ volk_32f_divide_aligned16_orc_impl.orc \ volk_32f_multiply_aligned16_orc_impl.orc \ +volk_32fc_multiply_aligned16_orc_impl.orc \ +volk_32fc_32f_multiply_aligned16_orc_impl.orc \ volk_32f_sqrt_aligned16_orc_impl.orc \ volk_32f_max_aligned16_orc_impl.orc \ volk_32f_min_aligned16_orc_impl.orc \ +volk_32f_normalize_aligned16_orc_impl.orc \ volk_32fc_magnitude_32f_aligned16_orc_impl.orc \ volk_32fc_magnitude_16s_aligned16_orc_impl.orc \ volk_16sc_magnitude_16s_aligned16_orc_impl.orc \ |