From 239144659b29c0a5ecd83a34e0e57387a1060ed7 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Tue, 7 Dec 2010 18:50:28 -0500 Subject: Initial checkin for VOLK - Vector-Optimized Library of Kernels. This is a new SIMD library. It currently stands by itself under the GNU Radio tree and can be used separately. We will integrate the build process into GNU Radio and start building off of its functionality over time. --- volk/lib/Makefile.am | 361 +++++++++++++++++++++ volk/lib/assembly.h | 67 ++++ volk/lib/cpuid_x86.S | 60 ++++ volk/lib/cpuid_x86_64.S | 54 +++ volk/lib/qa_16s_add_quad_aligned16.cc | 89 +++++ volk/lib/qa_16s_add_quad_aligned16.h | 18 + volk/lib/qa_16s_branch_4_state_8_aligned16.cc | 106 ++++++ volk/lib/qa_16s_branch_4_state_8_aligned16.h | 18 + volk/lib/qa_16s_convert_32f_aligned16.cc | 73 +++++ volk/lib/qa_16s_convert_32f_aligned16.h | 18 + volk/lib/qa_16s_convert_32f_unaligned16.cc | 73 +++++ volk/lib/qa_16s_convert_32f_unaligned16.h | 18 + volk/lib/qa_16s_convert_8s_aligned16.cc | 60 ++++ volk/lib/qa_16s_convert_8s_aligned16.h | 18 + volk/lib/qa_16s_convert_8s_unaligned16.cc | 60 ++++ volk/lib/qa_16s_convert_8s_unaligned16.h | 18 + volk/lib/qa_16s_max_star_aligned16.cc | 65 ++++ volk/lib/qa_16s_max_star_aligned16.h | 18 + volk/lib/qa_16s_max_star_horizontal_aligned16.cc | 79 +++++ volk/lib/qa_16s_max_star_horizontal_aligned16.h | 18 + .../lib/qa_16s_permute_and_scalar_add_aligned16.cc | 78 +++++ volk/lib/qa_16s_permute_and_scalar_add_aligned16.h | 18 + volk/lib/qa_16s_quad_max_star_aligned16.cc | 59 ++++ volk/lib/qa_16s_quad_max_star_aligned16.h | 18 + volk/lib/qa_16sc_deinterleave_16s_aligned16.cc | 76 +++++ volk/lib/qa_16sc_deinterleave_16s_aligned16.h | 18 + volk/lib/qa_16sc_deinterleave_32f_aligned16.cc | 63 ++++ volk/lib/qa_16sc_deinterleave_32f_aligned16.h | 18 + .../lib/qa_16sc_deinterleave_real_16s_aligned16.cc | 71 ++++ volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h | 18 + .../lib/qa_16sc_deinterleave_real_32f_aligned16.cc | 123 +++++++ 
volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h | 18 + volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc | 60 ++++ volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h | 18 + volk/lib/qa_16sc_magnitude_16s_aligned16.cc | 70 ++++ volk/lib/qa_16sc_magnitude_16s_aligned16.h | 18 + volk/lib/qa_16sc_magnitude_32f_aligned16.cc | 70 ++++ volk/lib/qa_16sc_magnitude_32f_aligned16.h | 18 + volk/lib/qa_16u_byteswap_aligned16.cc | 60 ++++ volk/lib/qa_16u_byteswap_aligned16.h | 18 + volk/lib/qa_32f_accumulator_aligned16.cc | 56 ++++ volk/lib/qa_32f_accumulator_aligned16.h | 18 + volk/lib/qa_32f_add_aligned16.cc | 60 ++++ volk/lib/qa_32f_add_aligned16.h | 18 + .../qa_32f_calc_spectral_noise_floor_aligned16.cc | 59 ++++ .../qa_32f_calc_spectral_noise_floor_aligned16.h | 18 + volk/lib/qa_32f_convert_16s_aligned16.cc | 70 ++++ volk/lib/qa_32f_convert_16s_aligned16.h | 18 + volk/lib/qa_32f_convert_16s_unaligned16.cc | 70 ++++ volk/lib/qa_32f_convert_16s_unaligned16.h | 18 + volk/lib/qa_32f_convert_32s_aligned16.cc | 70 ++++ volk/lib/qa_32f_convert_32s_aligned16.h | 18 + volk/lib/qa_32f_convert_32s_unaligned16.cc | 70 ++++ volk/lib/qa_32f_convert_32s_unaligned16.h | 18 + volk/lib/qa_32f_convert_64f_aligned16.cc | 60 ++++ volk/lib/qa_32f_convert_64f_aligned16.h | 18 + volk/lib/qa_32f_convert_64f_unaligned16.cc | 60 ++++ volk/lib/qa_32f_convert_64f_unaligned16.h | 18 + volk/lib/qa_32f_convert_8s_aligned16.cc | 70 ++++ volk/lib/qa_32f_convert_8s_aligned16.h | 18 + volk/lib/qa_32f_convert_8s_unaligned16.cc | 70 ++++ volk/lib/qa_32f_convert_8s_unaligned16.h | 18 + volk/lib/qa_32f_divide_aligned16.cc | 60 ++++ volk/lib/qa_32f_divide_aligned16.h | 18 + volk/lib/qa_32f_dot_prod_aligned16.cc | 183 +++++++++++ volk/lib/qa_32f_dot_prod_aligned16.h | 18 + volk/lib/qa_32f_dot_prod_unaligned16.cc | 190 +++++++++++ volk/lib/qa_32f_dot_prod_unaligned16.h | 18 + volk/lib/qa_32f_fm_detect_aligned16.cc | 60 ++++ volk/lib/qa_32f_fm_detect_aligned16.h | 18 + volk/lib/qa_32f_index_max_aligned16.cc | 103 
++++++ volk/lib/qa_32f_index_max_aligned16.h | 18 + volk/lib/qa_32f_interleave_16sc_aligned16.cc | 75 +++++ volk/lib/qa_32f_interleave_16sc_aligned16.h | 18 + volk/lib/qa_32f_interleave_32fc_aligned16.cc | 62 ++++ volk/lib/qa_32f_interleave_32fc_aligned16.h | 18 + volk/lib/qa_32f_max_aligned16.cc | 60 ++++ volk/lib/qa_32f_max_aligned16.h | 18 + volk/lib/qa_32f_min_aligned16.cc | 60 ++++ volk/lib/qa_32f_min_aligned16.h | 18 + volk/lib/qa_32f_multiply_aligned16.cc | 60 ++++ volk/lib/qa_32f_multiply_aligned16.h | 18 + volk/lib/qa_32f_normalize_aligned16.cc | 65 ++++ volk/lib/qa_32f_normalize_aligned16.h | 18 + volk/lib/qa_32f_power_aligned16.cc | 95 ++++++ volk/lib/qa_32f_power_aligned16.h | 18 + volk/lib/qa_32f_sqrt_aligned16.cc | 59 ++++ volk/lib/qa_32f_sqrt_aligned16.h | 18 + volk/lib/qa_32f_stddev_aligned16.cc | 74 +++++ volk/lib/qa_32f_stddev_aligned16.h | 18 + volk/lib/qa_32f_stddev_and_mean_aligned16.cc | 75 +++++ volk/lib/qa_32f_stddev_and_mean_aligned16.h | 18 + volk/lib/qa_32f_subtract_aligned16.cc | 60 ++++ volk/lib/qa_32f_subtract_aligned16.h | 18 + volk/lib/qa_32f_sum_of_poly_aligned16.cc | 142 ++++++++ volk/lib/qa_32f_sum_of_poly_aligned16.h | 18 + volk/lib/qa_32fc_32f_multiply_aligned16.cc | 85 +++++ volk/lib/qa_32fc_32f_multiply_aligned16.h | 18 + volk/lib/qa_32fc_32f_power_32fc_aligned16.cc | 83 +++++ volk/lib/qa_32fc_32f_power_32fc_aligned16.h | 18 + volk/lib/qa_32fc_atan2_32f_aligned16.cc | 75 +++++ volk/lib/qa_32fc_atan2_32f_aligned16.h | 18 + volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc | 137 ++++++++ volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h | 18 + volk/lib/qa_32fc_deinterleave_32f_aligned16.cc | 63 ++++ volk/lib/qa_32fc_deinterleave_32f_aligned16.h | 18 + volk/lib/qa_32fc_deinterleave_64f_aligned16.cc | 63 ++++ volk/lib/qa_32fc_deinterleave_64f_aligned16.h | 18 + .../lib/qa_32fc_deinterleave_real_16s_aligned16.cc | 60 ++++ volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h | 18 + .../lib/qa_32fc_deinterleave_real_32f_aligned16.cc | 60 
++++ volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h | 18 + .../lib/qa_32fc_deinterleave_real_64f_aligned16.cc | 60 ++++ volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h | 18 + volk/lib/qa_32fc_dot_prod_aligned16.cc | 214 ++++++++++++ volk/lib/qa_32fc_dot_prod_aligned16.h | 20 ++ volk/lib/qa_32fc_index_max_aligned16.cc | 89 +++++ volk/lib/qa_32fc_index_max_aligned16.h | 18 + volk/lib/qa_32fc_magnitude_16s_aligned16.cc | 70 ++++ volk/lib/qa_32fc_magnitude_16s_aligned16.h | 18 + volk/lib/qa_32fc_magnitude_32f_aligned16.cc | 70 ++++ volk/lib/qa_32fc_magnitude_32f_aligned16.h | 18 + volk/lib/qa_32fc_multiply_aligned16.cc | 86 +++++ volk/lib/qa_32fc_multiply_aligned16.h | 18 + ...qa_32fc_power_spectral_density_32f_aligned16.cc | 63 ++++ .../qa_32fc_power_spectral_density_32f_aligned16.h | 18 + volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc | 63 ++++ volk/lib/qa_32fc_power_spectrum_32f_aligned16.h | 18 + volk/lib/qa_32fc_square_dist_aligned16.cc | 91 ++++++ volk/lib/qa_32fc_square_dist_aligned16.h | 18 + .../qa_32fc_square_dist_scalar_mult_aligned16.cc | 96 ++++++ .../qa_32fc_square_dist_scalar_mult_aligned16.h | 18 + volk/lib/qa_32s_and_aligned16.cc | 60 ++++ volk/lib/qa_32s_and_aligned16.h | 18 + volk/lib/qa_32s_convert_32f_aligned16.cc | 60 ++++ volk/lib/qa_32s_convert_32f_aligned16.h | 18 + volk/lib/qa_32s_convert_32f_unaligned16.cc | 60 ++++ volk/lib/qa_32s_convert_32f_unaligned16.h | 18 + volk/lib/qa_32s_or_aligned16.cc | 60 ++++ volk/lib/qa_32s_or_aligned16.h | 18 + volk/lib/qa_32u_byteswap_aligned16.cc | 59 ++++ volk/lib/qa_32u_byteswap_aligned16.h | 18 + volk/lib/qa_32u_popcnt_aligned16.cc | 61 ++++ volk/lib/qa_32u_popcnt_aligned16.h | 18 + volk/lib/qa_64f_convert_32f_aligned16.cc | 60 ++++ volk/lib/qa_64f_convert_32f_aligned16.h | 18 + volk/lib/qa_64f_convert_32f_unaligned16.cc | 60 ++++ volk/lib/qa_64f_convert_32f_unaligned16.h | 18 + volk/lib/qa_64f_max_aligned16.cc | 60 ++++ volk/lib/qa_64f_max_aligned16.h | 18 + volk/lib/qa_64f_min_aligned16.cc | 60 
++++ volk/lib/qa_64f_min_aligned16.h | 18 + volk/lib/qa_64u_byteswap_aligned16.cc | 59 ++++ volk/lib/qa_64u_byteswap_aligned16.h | 18 + volk/lib/qa_64u_popcnt_aligned16.cc | 61 ++++ volk/lib/qa_64u_popcnt_aligned16.h | 18 + volk/lib/qa_8s_convert_16s_aligned16.cc | 63 ++++ volk/lib/qa_8s_convert_16s_aligned16.h | 18 + volk/lib/qa_8s_convert_16s_unaligned16.cc | 63 ++++ volk/lib/qa_8s_convert_16s_unaligned16.h | 18 + volk/lib/qa_8s_convert_32f_aligned16.cc | 63 ++++ volk/lib/qa_8s_convert_32f_aligned16.h | 18 + volk/lib/qa_8s_convert_32f_unaligned16.cc | 63 ++++ volk/lib/qa_8s_convert_32f_unaligned16.h | 18 + volk/lib/qa_8sc_deinterleave_16s_aligned16.cc | 67 ++++ volk/lib/qa_8sc_deinterleave_16s_aligned16.h | 18 + volk/lib/qa_8sc_deinterleave_32f_aligned16.cc | 134 ++++++++ volk/lib/qa_8sc_deinterleave_32f_aligned16.h | 18 + volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc | 64 ++++ volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h | 18 + volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc | 138 ++++++++ volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h | 18 + volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc | 60 ++++ volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h | 18 + .../qa_8sc_multiply_conjugate_16sc_aligned16.cc | 87 +++++ .../lib/qa_8sc_multiply_conjugate_16sc_aligned16.h | 18 + .../qa_8sc_multiply_conjugate_32fc_aligned16.cc | 87 +++++ .../lib/qa_8sc_multiply_conjugate_32fc_aligned16.h | 18 + volk/lib/qa_volk.cc | 211 ++++++++++++ volk/lib/qa_volk.h | 36 ++ volk/lib/test_all.cc | 82 +++++ volk/lib/volk_rank_archs.c | 13 + volk/lib/volk_rank_archs.h | 14 + 183 files changed, 9136 insertions(+) create mode 100644 volk/lib/Makefile.am create mode 100644 volk/lib/assembly.h create mode 100644 volk/lib/cpuid_x86.S create mode 100644 volk/lib/cpuid_x86_64.S create mode 100644 volk/lib/qa_16s_add_quad_aligned16.cc create mode 100644 volk/lib/qa_16s_add_quad_aligned16.h create mode 100644 volk/lib/qa_16s_branch_4_state_8_aligned16.cc create mode 100644 
volk/lib/qa_16s_branch_4_state_8_aligned16.h create mode 100644 volk/lib/qa_16s_convert_32f_aligned16.cc create mode 100644 volk/lib/qa_16s_convert_32f_aligned16.h create mode 100644 volk/lib/qa_16s_convert_32f_unaligned16.cc create mode 100644 volk/lib/qa_16s_convert_32f_unaligned16.h create mode 100644 volk/lib/qa_16s_convert_8s_aligned16.cc create mode 100644 volk/lib/qa_16s_convert_8s_aligned16.h create mode 100644 volk/lib/qa_16s_convert_8s_unaligned16.cc create mode 100644 volk/lib/qa_16s_convert_8s_unaligned16.h create mode 100644 volk/lib/qa_16s_max_star_aligned16.cc create mode 100644 volk/lib/qa_16s_max_star_aligned16.h create mode 100644 volk/lib/qa_16s_max_star_horizontal_aligned16.cc create mode 100644 volk/lib/qa_16s_max_star_horizontal_aligned16.h create mode 100644 volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc create mode 100644 volk/lib/qa_16s_permute_and_scalar_add_aligned16.h create mode 100644 volk/lib/qa_16s_quad_max_star_aligned16.cc create mode 100644 volk/lib/qa_16s_quad_max_star_aligned16.h create mode 100644 volk/lib/qa_16sc_deinterleave_16s_aligned16.cc create mode 100644 volk/lib/qa_16sc_deinterleave_16s_aligned16.h create mode 100644 volk/lib/qa_16sc_deinterleave_32f_aligned16.cc create mode 100644 volk/lib/qa_16sc_deinterleave_32f_aligned16.h create mode 100644 volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc create mode 100644 volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h create mode 100644 volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc create mode 100644 volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h create mode 100644 volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc create mode 100644 volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h create mode 100644 volk/lib/qa_16sc_magnitude_16s_aligned16.cc create mode 100644 volk/lib/qa_16sc_magnitude_16s_aligned16.h create mode 100644 volk/lib/qa_16sc_magnitude_32f_aligned16.cc create mode 100644 volk/lib/qa_16sc_magnitude_32f_aligned16.h create mode 100644 
volk/lib/qa_16u_byteswap_aligned16.cc create mode 100644 volk/lib/qa_16u_byteswap_aligned16.h create mode 100644 volk/lib/qa_32f_accumulator_aligned16.cc create mode 100644 volk/lib/qa_32f_accumulator_aligned16.h create mode 100644 volk/lib/qa_32f_add_aligned16.cc create mode 100644 volk/lib/qa_32f_add_aligned16.h create mode 100644 volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc create mode 100644 volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h create mode 100644 volk/lib/qa_32f_convert_16s_aligned16.cc create mode 100644 volk/lib/qa_32f_convert_16s_aligned16.h create mode 100644 volk/lib/qa_32f_convert_16s_unaligned16.cc create mode 100644 volk/lib/qa_32f_convert_16s_unaligned16.h create mode 100644 volk/lib/qa_32f_convert_32s_aligned16.cc create mode 100644 volk/lib/qa_32f_convert_32s_aligned16.h create mode 100644 volk/lib/qa_32f_convert_32s_unaligned16.cc create mode 100644 volk/lib/qa_32f_convert_32s_unaligned16.h create mode 100644 volk/lib/qa_32f_convert_64f_aligned16.cc create mode 100644 volk/lib/qa_32f_convert_64f_aligned16.h create mode 100644 volk/lib/qa_32f_convert_64f_unaligned16.cc create mode 100644 volk/lib/qa_32f_convert_64f_unaligned16.h create mode 100644 volk/lib/qa_32f_convert_8s_aligned16.cc create mode 100644 volk/lib/qa_32f_convert_8s_aligned16.h create mode 100644 volk/lib/qa_32f_convert_8s_unaligned16.cc create mode 100644 volk/lib/qa_32f_convert_8s_unaligned16.h create mode 100644 volk/lib/qa_32f_divide_aligned16.cc create mode 100644 volk/lib/qa_32f_divide_aligned16.h create mode 100644 volk/lib/qa_32f_dot_prod_aligned16.cc create mode 100644 volk/lib/qa_32f_dot_prod_aligned16.h create mode 100644 volk/lib/qa_32f_dot_prod_unaligned16.cc create mode 100644 volk/lib/qa_32f_dot_prod_unaligned16.h create mode 100644 volk/lib/qa_32f_fm_detect_aligned16.cc create mode 100644 volk/lib/qa_32f_fm_detect_aligned16.h create mode 100644 volk/lib/qa_32f_index_max_aligned16.cc create mode 100644 volk/lib/qa_32f_index_max_aligned16.h 
create mode 100644 volk/lib/qa_32f_interleave_16sc_aligned16.cc create mode 100644 volk/lib/qa_32f_interleave_16sc_aligned16.h create mode 100644 volk/lib/qa_32f_interleave_32fc_aligned16.cc create mode 100644 volk/lib/qa_32f_interleave_32fc_aligned16.h create mode 100644 volk/lib/qa_32f_max_aligned16.cc create mode 100644 volk/lib/qa_32f_max_aligned16.h create mode 100644 volk/lib/qa_32f_min_aligned16.cc create mode 100644 volk/lib/qa_32f_min_aligned16.h create mode 100644 volk/lib/qa_32f_multiply_aligned16.cc create mode 100644 volk/lib/qa_32f_multiply_aligned16.h create mode 100644 volk/lib/qa_32f_normalize_aligned16.cc create mode 100644 volk/lib/qa_32f_normalize_aligned16.h create mode 100644 volk/lib/qa_32f_power_aligned16.cc create mode 100644 volk/lib/qa_32f_power_aligned16.h create mode 100644 volk/lib/qa_32f_sqrt_aligned16.cc create mode 100644 volk/lib/qa_32f_sqrt_aligned16.h create mode 100644 volk/lib/qa_32f_stddev_aligned16.cc create mode 100644 volk/lib/qa_32f_stddev_aligned16.h create mode 100644 volk/lib/qa_32f_stddev_and_mean_aligned16.cc create mode 100644 volk/lib/qa_32f_stddev_and_mean_aligned16.h create mode 100644 volk/lib/qa_32f_subtract_aligned16.cc create mode 100644 volk/lib/qa_32f_subtract_aligned16.h create mode 100644 volk/lib/qa_32f_sum_of_poly_aligned16.cc create mode 100644 volk/lib/qa_32f_sum_of_poly_aligned16.h create mode 100644 volk/lib/qa_32fc_32f_multiply_aligned16.cc create mode 100644 volk/lib/qa_32fc_32f_multiply_aligned16.h create mode 100644 volk/lib/qa_32fc_32f_power_32fc_aligned16.cc create mode 100644 volk/lib/qa_32fc_32f_power_32fc_aligned16.h create mode 100644 volk/lib/qa_32fc_atan2_32f_aligned16.cc create mode 100644 volk/lib/qa_32fc_atan2_32f_aligned16.h create mode 100644 volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc create mode 100644 volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h create mode 100644 volk/lib/qa_32fc_deinterleave_32f_aligned16.cc create mode 100644 
volk/lib/qa_32fc_deinterleave_32f_aligned16.h create mode 100644 volk/lib/qa_32fc_deinterleave_64f_aligned16.cc create mode 100644 volk/lib/qa_32fc_deinterleave_64f_aligned16.h create mode 100644 volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc create mode 100644 volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h create mode 100644 volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc create mode 100644 volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h create mode 100644 volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc create mode 100644 volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h create mode 100644 volk/lib/qa_32fc_dot_prod_aligned16.cc create mode 100644 volk/lib/qa_32fc_dot_prod_aligned16.h create mode 100644 volk/lib/qa_32fc_index_max_aligned16.cc create mode 100644 volk/lib/qa_32fc_index_max_aligned16.h create mode 100644 volk/lib/qa_32fc_magnitude_16s_aligned16.cc create mode 100644 volk/lib/qa_32fc_magnitude_16s_aligned16.h create mode 100644 volk/lib/qa_32fc_magnitude_32f_aligned16.cc create mode 100644 volk/lib/qa_32fc_magnitude_32f_aligned16.h create mode 100644 volk/lib/qa_32fc_multiply_aligned16.cc create mode 100644 volk/lib/qa_32fc_multiply_aligned16.h create mode 100644 volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc create mode 100644 volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h create mode 100644 volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc create mode 100644 volk/lib/qa_32fc_power_spectrum_32f_aligned16.h create mode 100644 volk/lib/qa_32fc_square_dist_aligned16.cc create mode 100644 volk/lib/qa_32fc_square_dist_aligned16.h create mode 100644 volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc create mode 100644 volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h create mode 100644 volk/lib/qa_32s_and_aligned16.cc create mode 100644 volk/lib/qa_32s_and_aligned16.h create mode 100644 volk/lib/qa_32s_convert_32f_aligned16.cc create mode 100644 volk/lib/qa_32s_convert_32f_aligned16.h create mode 100644 
volk/lib/qa_32s_convert_32f_unaligned16.cc create mode 100644 volk/lib/qa_32s_convert_32f_unaligned16.h create mode 100644 volk/lib/qa_32s_or_aligned16.cc create mode 100644 volk/lib/qa_32s_or_aligned16.h create mode 100644 volk/lib/qa_32u_byteswap_aligned16.cc create mode 100644 volk/lib/qa_32u_byteswap_aligned16.h create mode 100644 volk/lib/qa_32u_popcnt_aligned16.cc create mode 100644 volk/lib/qa_32u_popcnt_aligned16.h create mode 100644 volk/lib/qa_64f_convert_32f_aligned16.cc create mode 100644 volk/lib/qa_64f_convert_32f_aligned16.h create mode 100644 volk/lib/qa_64f_convert_32f_unaligned16.cc create mode 100644 volk/lib/qa_64f_convert_32f_unaligned16.h create mode 100644 volk/lib/qa_64f_max_aligned16.cc create mode 100644 volk/lib/qa_64f_max_aligned16.h create mode 100644 volk/lib/qa_64f_min_aligned16.cc create mode 100644 volk/lib/qa_64f_min_aligned16.h create mode 100644 volk/lib/qa_64u_byteswap_aligned16.cc create mode 100644 volk/lib/qa_64u_byteswap_aligned16.h create mode 100644 volk/lib/qa_64u_popcnt_aligned16.cc create mode 100644 volk/lib/qa_64u_popcnt_aligned16.h create mode 100644 volk/lib/qa_8s_convert_16s_aligned16.cc create mode 100644 volk/lib/qa_8s_convert_16s_aligned16.h create mode 100644 volk/lib/qa_8s_convert_16s_unaligned16.cc create mode 100644 volk/lib/qa_8s_convert_16s_unaligned16.h create mode 100644 volk/lib/qa_8s_convert_32f_aligned16.cc create mode 100644 volk/lib/qa_8s_convert_32f_aligned16.h create mode 100644 volk/lib/qa_8s_convert_32f_unaligned16.cc create mode 100644 volk/lib/qa_8s_convert_32f_unaligned16.h create mode 100644 volk/lib/qa_8sc_deinterleave_16s_aligned16.cc create mode 100644 volk/lib/qa_8sc_deinterleave_16s_aligned16.h create mode 100644 volk/lib/qa_8sc_deinterleave_32f_aligned16.cc create mode 100644 volk/lib/qa_8sc_deinterleave_32f_aligned16.h create mode 100644 volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc create mode 100644 volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h create mode 100644 
volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc create mode 100644 volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h create mode 100644 volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc create mode 100644 volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h create mode 100644 volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc create mode 100644 volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h create mode 100644 volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc create mode 100644 volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h create mode 100644 volk/lib/qa_volk.cc create mode 100644 volk/lib/qa_volk.h create mode 100644 volk/lib/test_all.cc create mode 100644 volk/lib/volk_rank_archs.c create mode 100644 volk/lib/volk_rank_archs.h (limited to 'volk/lib') diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am new file mode 100644 index 000000000..97eb75680 --- /dev/null +++ b/volk/lib/Makefile.am @@ -0,0 +1,361 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) $(LV_CXXFLAGS) + + +# We build 3 libraries and 1 executable here. 
Two libraries (libvolk +# and libvolk_runtime) contain the non-QA code, and one (libvolk_qa) contains only the +# libcppunit-based QA code. The C++ QA code is especially recommended +# when you have general purpose C or C++ code that may not get +# thoroughly exercised by building and running a GR block. The +# executable runs the QA code at "make check" time. +# +# N.B., If there's a SWIG generated shared library and associated +# python code, it will be contained in ../python, not here. (That +# code is conditionally built depending on the state of the +# --without-python configure option.) However, the .i should be here +# next to the .h that it's based on. + + +# list of programs run by "make check" and "make distcheck" +TESTS = test_all + + +lib_LTLIBRARIES = \ + libvolk.la \ + libvolk_runtime.la \ + libvolk_qa.la + + +# ---------------------------------------------------------------- +# The main library +# ---------------------------------------------------------------- + +universal_runtime_CODE = \ + volk_runtime.c \ + volk_init.c \ + volk_rank_archs.c + +universal_CODE = \ + volk.c \ + volk_environment_init.c + +generic_CODE = \ + volk_cpu_generic.cc + +x86_CODE = \ + volk_cpu_x86.c + +x86_SUBCODE = \ + cpuid_x86.S + +x86_64_SUBCODE = \ + cpuid_x86_64.S + +powerpc_CODE = \ + volk_cpu_powerpc.cc + + +if MD_CPU_generic +libvolk_la_SOURCES = \ + $(generic_CODE) \ + $(universal_CODE) +libvolk_runtime_la_SOURCES = \ + $(generic_CODE) \ + $(universal_runtime_CODE) + +endif + +if MD_CPU_x86 +if MD_SUBCPU_x86_64 +libvolk_la_SOURCES = \ + $(x86_CODE) \ + $(x86_64_SUBCODE) \ + $(universal_CODE) + +libvolk_runtime_la_SOURCES = \ + $(x86_CODE) \ + $(x86_64_SUBCODE) \ + $(universal_runtime_CODE) +else +libvolk_la_SOURCES = \ + $(x86_CODE) \ + $(x86_SUBCODE) \ + $(universal_CODE) + +libvolk_runtime_la_SOURCES = \ + $(x86_CODE) \ + $(x86_SUBCODE) \ + $(universal_runtime_CODE) +endif +endif + + +if MD_CPU_powerpc +libvolk_la_SOURCES = \ + $(powerpc_CODE) \ + $(universal_CODE) + 
+libvolk_runtime_la_SOURCES = \ + $(powerpc_CODE) \ + $(universal_runtime_CODE) +endif + + + +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 + +libvolk_la_LIBADD = + + + +# ---------------------------------------------------------------- +# The QA library. Note libvolk.la in LIBADD +# ---------------------------------------------------------------- +libvolk_qa_la_SOURCES = \ + qa_volk.cc \ + qa_16s_quad_max_star_aligned16.cc \ + qa_32fc_dot_prod_aligned16.cc \ + qa_32fc_square_dist_aligned16.cc \ + qa_32fc_square_dist_scalar_mult_aligned16.cc \ + qa_32f_sum_of_poly_aligned16.cc \ + qa_32fc_index_max_aligned16.cc \ + qa_32f_index_max_aligned16.cc \ + qa_32fc_conjugate_dot_prod_aligned16.cc \ + qa_16s_permute_and_scalar_add_aligned16.cc \ + qa_16s_branch_4_state_8_aligned16.cc \ + qa_16s_max_star_horizontal_aligned16.cc \ + qa_16s_max_star_aligned16.cc \ + qa_16s_add_quad_aligned16.cc \ + qa_32f_add_aligned16.cc \ + qa_32f_subtract_aligned16.cc \ + qa_32f_max_aligned16.cc \ + qa_32f_min_aligned16.cc \ + qa_64f_max_aligned16.cc \ + qa_64f_min_aligned16.cc \ + qa_32s_and_aligned16.cc \ + qa_32s_or_aligned16.cc \ + qa_32f_dot_prod_aligned16.cc \ + qa_32f_dot_prod_unaligned16.cc \ + qa_32f_fm_detect_aligned16.cc \ + qa_32fc_32f_multiply_aligned16.cc \ + qa_32fc_multiply_aligned16.cc \ + qa_32f_divide_aligned16.cc \ + qa_32f_multiply_aligned16.cc \ + qa_32f_sqrt_aligned16.cc \ + qa_8sc_multiply_conjugate_16sc_aligned16.cc \ + qa_8sc_multiply_conjugate_32fc_aligned16.cc \ + qa_32u_popcnt_aligned16.cc \ + qa_64u_popcnt_aligned16.cc \ + qa_64u_byteswap_aligned16.cc \ + qa_8sc_deinterleave_32f_aligned16.cc \ + qa_16sc_deinterleave_32f_aligned16.cc \ + qa_8sc_deinterleave_16s_aligned16.cc \ + qa_32f_interleave_32fc_aligned16.cc \ + qa_16u_byteswap_aligned16.cc \ + qa_16sc_deinterleave_16s_aligned16.cc \ + qa_32fc_deinterleave_real_32f_aligned16.cc \ + qa_32fc_magnitude_32f_aligned16.cc \ + 
qa_32fc_deinterleave_real_64f_aligned16.cc \ + qa_32fc_deinterleave_real_16s_aligned16.cc \ + qa_32fc_magnitude_16s_aligned16.cc \ + qa_32fc_deinterleave_32f_aligned16.cc \ + qa_8sc_deinterleave_real_8s_aligned16.cc \ + qa_32fc_deinterleave_64f_aligned16.cc \ + qa_32f_interleave_16sc_aligned16.cc \ + qa_16sc_deinterleave_real_8s_aligned16.cc \ + qa_16sc_deinterleave_real_32f_aligned16.cc \ + qa_16sc_magnitude_32f_aligned16.cc \ + qa_32u_byteswap_aligned16.cc \ + qa_16sc_deinterleave_real_16s_aligned16.cc \ + qa_8sc_deinterleave_real_32f_aligned16.cc \ + qa_16sc_magnitude_16s_aligned16.cc \ + qa_32f_normalize_aligned16.cc \ + qa_8sc_deinterleave_real_16s_aligned16.cc \ + qa_16s_convert_32f_aligned16.cc \ + qa_16s_convert_32f_unaligned16.cc \ + qa_16s_convert_8s_aligned16.cc \ + qa_16s_convert_8s_unaligned16.cc \ + qa_32f_convert_16s_aligned16.cc \ + qa_32f_convert_16s_unaligned16.cc \ + qa_32f_convert_32s_aligned16.cc \ + qa_32f_convert_32s_unaligned16.cc \ + qa_32f_convert_64f_aligned16.cc \ + qa_32f_convert_64f_unaligned16.cc \ + qa_32f_convert_8s_aligned16.cc \ + qa_32f_convert_8s_unaligned16.cc \ + qa_32s_convert_32f_aligned16.cc \ + qa_32s_convert_32f_unaligned16.cc \ + qa_64f_convert_32f_aligned16.cc \ + qa_64f_convert_32f_unaligned16.cc \ + qa_8s_convert_16s_aligned16.cc \ + qa_8s_convert_16s_unaligned16.cc \ + qa_8s_convert_32f_aligned16.cc \ + qa_8s_convert_32f_unaligned16.cc \ + qa_32fc_32f_power_32fc_aligned16.cc \ + qa_32f_power_aligned16.cc \ + qa_32fc_atan2_32f_aligned16.cc \ + qa_32fc_power_spectral_density_32f_aligned16.cc \ + qa_32fc_power_spectrum_32f_aligned16.cc \ + qa_32f_calc_spectral_noise_floor_aligned16.cc \ + qa_32f_accumulator_aligned16.cc \ + qa_32f_stddev_aligned16.cc \ + qa_32f_stddev_and_mean_aligned16.cc + +libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 + +libvolk_qa_la_LIBADD = \ + libvolk.la \ + libvolk_runtime.la \ + $(CPPUNIT_LIBS) + +# ---------------------------------------------------------------- +# headers that 
don't get installed +# ---------------------------------------------------------------- +noinst_HEADERS = \ + volk_init.h \ + qa_volk.h \ + qa_16s_quad_max_star_aligned16.h \ + qa_32fc_dot_prod_aligned16.h \ + qa_32fc_square_dist_aligned16.h \ + qa_32fc_square_dist_scalar_mult_aligned16.h \ + qa_32f_sum_of_poly_aligned16.h \ + qa_32fc_index_max_aligned16.h \ + qa_32f_index_max_aligned16.h \ + qa_32fc_conjugate_dot_prod_aligned16.h \ + qa_16s_permute_and_scalar_add_aligned16.h \ + qa_16s_branch_4_state_8_aligned16.h \ + qa_16s_max_star_horizontal_aligned16.h \ + qa_16s_max_star_aligned16.h \ + qa_16s_add_quad_aligned16.h \ + qa_32f_add_aligned16.h \ + qa_32f_subtract_aligned16.h \ + qa_32f_max_aligned16.h \ + qa_32f_min_aligned16.h \ + qa_64f_max_aligned16.h \ + qa_64f_min_aligned16.h \ + qa_32s_and_aligned16.h \ + qa_32s_or_aligned16.h \ + qa_32f_dot_prod_aligned16.h \ + qa_32f_dot_prod_unaligned16.h \ + qa_32f_fm_detect_aligned16.h \ + qa_32fc_32f_multiply_aligned16.h \ + qa_32fc_multiply_aligned16.h \ + qa_32f_divide_aligned16.h \ + qa_32f_multiply_aligned16.h \ + qa_32f_sqrt_aligned16.h \ + qa_8sc_multiply_conjugate_16sc_aligned16.h \ + qa_8sc_multiply_conjugate_32fc_aligned16.h \ + qa_32u_popcnt_aligned16.h \ + qa_64u_popcnt_aligned16.h \ + qa_64u_byteswap_aligned16.h \ + qa_8sc_deinterleave_32f_aligned16.h \ + qa_16sc_deinterleave_32f_aligned16.h \ + qa_8sc_deinterleave_16s_aligned16.h \ + qa_32f_interleave_32fc_aligned16.h \ + qa_16u_byteswap_aligned16.h \ + qa_16sc_deinterleave_16s_aligned16.h \ + qa_32fc_deinterleave_real_32f_aligned16.h \ + qa_32fc_magnitude_32f_aligned16.h \ + qa_32fc_deinterleave_real_64f_aligned16.h \ + qa_32fc_deinterleave_real_16s_aligned16.h \ + qa_32fc_magnitude_16s_aligned16.h \ + qa_32fc_deinterleave_32f_aligned16.h \ + qa_8sc_deinterleave_real_8s_aligned16.h \ + qa_32fc_deinterleave_64f_aligned16.h \ + qa_32f_interleave_16sc_aligned16.h \ + qa_16sc_deinterleave_real_8s_aligned16.h \ + qa_16sc_deinterleave_real_32f_aligned16.h \ + 
qa_16sc_magnitude_32f_aligned16.h \ + qa_32u_byteswap_aligned16.h \ + qa_16sc_deinterleave_real_16s_aligned16.h \ + qa_8sc_deinterleave_real_32f_aligned16.h \ + qa_16sc_magnitude_16s_aligned16.h \ + qa_32f_normalize_aligned16.h \ + qa_8sc_deinterleave_real_16s_aligned16.h \ + qa_16s_convert_32f_aligned16.h \ + qa_16s_convert_32f_unaligned16.h \ + qa_16s_convert_8s_aligned16.h \ + qa_16s_convert_8s_unaligned16.h \ + qa_32f_convert_16s_aligned16.h \ + qa_32f_convert_16s_unaligned16.h \ + qa_32f_convert_32s_aligned16.h \ + qa_32f_convert_32s_unaligned16.h \ + qa_32f_convert_64f_aligned16.h \ + qa_32f_convert_64f_unaligned16.h \ + qa_32f_convert_8s_aligned16.h \ + qa_32f_convert_8s_unaligned16.h \ + qa_32s_convert_32f_aligned16.h \ + qa_32s_convert_32f_unaligned16.h \ + qa_64f_convert_32f_aligned16.h \ + qa_64f_convert_32f_unaligned16.h \ + qa_8s_convert_16s_aligned16.h \ + qa_8s_convert_16s_unaligned16.h \ + qa_8s_convert_32f_aligned16.h \ + qa_8s_convert_32f_unaligned16.h \ + qa_32fc_32f_power_32fc_aligned16.h \ + qa_32f_power_aligned16.h \ + qa_32fc_atan2_32f_aligned16.h \ + qa_32fc_power_spectral_density_32f_aligned16.h \ + qa_32fc_power_spectrum_32f_aligned16.h \ + qa_32f_calc_spectral_noise_floor_aligned16.h \ + qa_32f_accumulator_aligned16.h \ + qa_32f_stddev_aligned16.h \ + qa_32f_stddev_and_mean_aligned16.h + + +# ---------------------------------------------------------------- +# Our test program +# ---------------------------------------------------------------- +noinst_PROGRAMS = \ + test_all + +test_all_SOURCES = test_all.cc +test_all_LDADD = libvolk_qa.la + + +distclean-local: + rm -f volk.c + rm -f volk_cpu_generic.c + rm -f volk_cpu_powerpc.c + rm -f volk_cpu_x86.c + rm -f volk_init.c + rm -f volk_init.h + rm -f volk_mktables + rm -f volk_mktables.c + rm -f volk_proccpu_sim.c + rm -f volk_runtime.c + rm -f volk_tables.h + rm -f volk_environment_init.c +#SUBDIRS = + +#ifdef BUILD_SSE +#SUBDIRS += sse +#elif BUILD_SPU +#SUBDIRS += spu +#else +#SUBDIRS += 
port +#endif + + diff --git a/volk/lib/assembly.h b/volk/lib/assembly.h new file mode 100644 index 000000000..8a99aa07c --- /dev/null +++ b/volk/lib/assembly.h @@ -0,0 +1,67 @@ +/* -*- c++ -*- */ +/* + * Copyright 2002 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef _ASSEMBLY_H_ +#define _ASSEMBLY_H_ + +#if defined (__APPLE__) && defined (__APPLE_CC__) + +// XCode ignores the .scl and .type directives in XCode 2.2.1 and 2.3, +// but creates an error in XCode 2.4, so this branch emits neither. 
+ +#define GLOB_SYMB(f) _ ## f + +#define DEF_FUNC_HEAD(f) /* none */ + +#define FUNC_TAIL(f) /* none */ + +#elif !defined (__ELF__) /* neither Apple nor ELF */ + +/* + * Too bad, the following define does not work as expected --SF + * #define GLOB_SYMB(f) __USER_LABEL_PREFIX__ ## f + */ +#define GLOB_SYMB(f) _ ## f + +#define DEF_FUNC_HEAD(f) \ + .def GLOB_SYMB(f); .scl 2; .type 32; .endef + +#define FUNC_TAIL(f) /* none */ + + +#else /* __ELF__ */ + + +#define GLOB_SYMB(f) f + +#define DEF_FUNC_HEAD(f) \ + .type GLOB_SYMB(f),@function \ + +#define FUNC_TAIL(f) \ + .Lfe1: \ + .size GLOB_SYMB(f),.Lfe1-GLOB_SYMB(f) + + +#endif /* __ELF__ */ + + +#endif /* _ASSEMBLY_H_ */ diff --git a/volk/lib/cpuid_x86.S b/volk/lib/cpuid_x86.S new file mode 100644 index 000000000..4e1a9404f --- /dev/null +++ b/volk/lib/cpuid_x86.S @@ -0,0 +1,60 @@ +# +# Copyright 2003 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. 
+#
+
+#
+# execute CPUID instruction, return EAX, EBX, ECX and EDX values in result
+#
+# void cpuid_x86 (unsigned int op, unsigned int result[4]);
+#
+# op is loaded into EAX as the CPUID leaf. ECX is cleared first, so leaves
+# that take a sub-leaf index in ECX always report sub-leaf 0.
+# result[0..3] receive EAX, EBX, ECX and EDX in that order.
+#
+
+#include "assembly.h"
+
+.file	"cpuid_x86.S"
+	.version	"01.01"
+.text
+.globl GLOB_SYMB(cpuid_x86)
+	DEF_FUNC_HEAD(cpuid_x86)
+GLOB_SYMB(cpuid_x86):
+	pushl	%ebp
+	movl	%esp, %ebp
+	pushl	%ebx		# must save in PIC mode, holds GOT pointer
+	pushl	%esi
+
+	movl	8(%ebp), %eax	# op
+	movl	12(%ebp), %esi	# result
+	xorl	%ecx, %ecx	# sub-leaf 0; ECX would otherwise hold leftover caller data
+	cpuid
+	movl	%eax, 0(%esi)
+	movl	%ebx, 4(%esi)
+	movl	%ecx, 8(%esi)
+	movl	%edx, 12(%esi)
+
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+
+FUNC_TAIL(cpuid_x86)
+	.ident	"Hand coded cpuid assembly"
+
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/volk/lib/cpuid_x86_64.S b/volk/lib/cpuid_x86_64.S
new file mode 100644
index 000000000..32b1847cd
--- /dev/null
+++ b/volk/lib/cpuid_x86_64.S
@@ -0,0 +1,54 @@
+#
+# Copyright 2003,2005 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+#
+# execute CPUID instruction, return EAX, EBX, ECX and EDX values in result
+#
+# void cpuid_x86 (unsigned int op, unsigned int result[4]);
+#
+# op arrives in EDI and result in RSI. ECX is cleared first, so leaves that
+# take a sub-leaf index in ECX always report sub-leaf 0.
+# result[0..3] receive EAX, EBX, ECX and EDX in that order.
+#
+
+#include "assembly.h"
+
+.file	"cpuid_x86_64.S"
+	.version	"01.01"
+.text
+.globl GLOB_SYMB(cpuid_x86)
+	DEF_FUNC_HEAD(cpuid_x86)
+GLOB_SYMB(cpuid_x86):
+	mov	%rbx, %r11	# rbx is callee-saved and cpuid overwrites it
+
+	movl	%edi, %eax	# op is a 32-bit argument; use only EDI
+	xorl	%ecx, %ecx	# sub-leaf 0; ECX would otherwise hold leftover caller data
+	cpuid
+	movl	%eax, 0(%rsi)	# result
+	movl	%ebx, 4(%rsi)
+	movl	%ecx, 8(%rsi)
+	movl	%edx, 12(%rsi)
+
+	mov	%r11, %rbx
+	retq
+
+FUNC_TAIL(cpuid_x86)
+	.ident	"Hand coded cpuid64 assembly"
+
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc
new file mode 100644
index 000000000..c3005c1be
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.cc
@@ -0,0 +1,89 @@
+#include
+#include
+#include
+#include
+#include
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_add_quad_aligned16::t1() {
+  printf("sse2 not available... 
no test performed\n"); +} + +#else + + + +void qa_16s_add_quad_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3200; + const int ITERS = 100000; + short input0[vlen] __attribute__ ((aligned (16))); + short input1[vlen] __attribute__ ((aligned (16))); + short input2[vlen] __attribute__ ((aligned (16))); + short input3[vlen] __attribute__ ((aligned (16))); + short input4[vlen] __attribute__ ((aligned (16))); + + short output0[vlen] __attribute__ ((aligned (16))); + short output1[vlen] __attribute__ ((aligned (16))); + short output2[vlen] __attribute__ ((aligned (16))); + short output3[vlen] __attribute__ ((aligned (16))); + short output01[vlen] __attribute__ ((aligned (16))); + short output11[vlen] __attribute__ ((aligned (16))); + short output21[vlen] __attribute__ ((aligned (16))); + short output31[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus1 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus1 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus2 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus2 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; + + input0[i] = plus0 - minus0; + input1[i] = plus1 - minus1; + input2[i] = plus2 - minus2; + input3[i] = plus3 - minus3; + input4[i] = plus4 - minus4; + + } + printf("16s_add_quad_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_add_quad_aligned16_manual(output0, output1, output2, output3, input0, input1, input2, input3, input4, vlen << 1 , "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + 
printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_add_quad_aligned16_manual(output01, output11, output21, output31, input0, input1, input2, input3, input4, vlen << 1 , "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output1[i], output11[i]); + CPPUNIT_ASSERT_EQUAL(output2[i], output21[i]); + CPPUNIT_ASSERT_EQUAL(output3[i], output31[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_add_quad_aligned16.h b/volk/lib/qa_16s_add_quad_aligned16.h new file mode 100644 index 000000000..3c1ae978b --- /dev/null +++ b/volk/lib/qa_16s_add_quad_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H +#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H + +#include +#include + +class qa_16s_add_quad_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc new file mode 100644 index 000000000..ba5e8ed93 --- /dev/null +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc @@ -0,0 +1,106 @@ +#include +#include +#include +#include + +//test for ssse3 + +#ifndef LV_HAVE_SSSE3 + +void qa_16s_branch_4_state_8_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + +void qa_16s_branch_4_state_8_aligned16::t1() { + const int num_iters = 1000000; + const int vlen = 32; + + static char permute0[16]__attribute__((aligned(16))) = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03}; + static char permute1[16]__attribute__((aligned(16))) = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01}; + static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f}; + static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d}; + static char* permuters[4] = {permute0, permute1, permute2, permute3}; + + unsigned int num_bytes = vlen << 1; + + volk_environment_init(); + clock_t start, end; + double total; + + short target[vlen] __attribute__ ((aligned (16))); + short target2[vlen] __attribute__ ((aligned (16))); + short target3[vlen] __attribute__ ((aligned (16))); + + short src0[vlen] __attribute__ ((aligned (16))); + short permute_indexes[vlen] __attribute__ ((aligned (16))) = { +7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 }; + short cntl0[vlen] __attribute__ ((aligned (16))) = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + short cntl1[vlen] __attribute__ ((aligned (16))) = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + short cntl2[vlen] __attribute__ ((aligned 
(16))) = { + 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 }; + short cntl3[vlen] __attribute__ ((aligned (16))) = { + 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff }; + short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4}; + + + + for(int i = 0; i < vlen; ++i) { + src0[i] = i; + + } + + + printf("16s_branch_4_state_8_aligned\n"); + + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("permute_and_scalar_add_time: %f\n", total); + + + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("branch_4_state_8_time, ssse3: %f\n", total); + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("permute_and_scalar_add_time, generic: %f\n", total); + + + + for(int i = 0; i < vlen; ++i) { + printf("psa... %d, b4s8... 
%d\n", target[i], target3[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT(target[i] == target2[i]); + CPPUNIT_ASSERT(target[i] == target3[i]); + } +} + + +#endif diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.h b/volk/lib/qa_16s_branch_4_state_8_aligned16.h new file mode 100644 index 000000000..41ab073e0 --- /dev/null +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H +#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H + +#include +#include + +class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_32f_aligned16.cc b/volk/lib/qa_16s_convert_32f_aligned16.cc new file mode 100644 index 000000000..7878d4737 --- /dev/null +++ b/volk/lib/qa_16s_convert_32f_aligned16.cc @@ -0,0 +1,73 @@ +#include +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE + +void qa_16s_convert_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_16s_convert_32f_aligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int16_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); + } + printf("16s_convert_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_16s_convert_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_convert_32f_aligned16.h b/volk/lib/qa_16s_convert_32f_aligned16.h new file mode 100644 index 000000000..ef813d96f --- /dev/null +++ b/volk/lib/qa_16s_convert_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H +#define INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H + +#include +#include + +class qa_16s_convert_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_convert_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.cc b/volk/lib/qa_16s_convert_32f_unaligned16.cc new file mode 100644 index 000000000..8c3121e5c --- /dev/null +++ b/volk/lib/qa_16s_convert_32f_unaligned16.cc @@ -0,0 +1,73 @@ +#include +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE + +void qa_16s_convert_32f_unaligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_16s_convert_32f_unaligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int16_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); + } + printf("16s_convert_32f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_32f_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_32f_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_16s_convert_32f_unaligned16(output_sse4_1, input0, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.h b/volk/lib/qa_16s_convert_32f_unaligned16.h new file mode 100644 index 000000000..aeb04f770 --- /dev/null +++ b/volk/lib/qa_16s_convert_32f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H +#define INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H + +#include +#include + +class qa_16s_convert_32f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_convert_32f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_8s_aligned16.cc b/volk/lib/qa_16s_convert_8s_aligned16.cc new file mode 100644 index 000000000..734b7784e --- /dev/null +++ b/volk/lib/qa_16s_convert_8s_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_convert_8s_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16s_convert_8s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int16_t input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); + } + printf("16s_convert_8s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_8s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_8s_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d -> %d...%d\n", input0[i], output_generic[i], output_sse2[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_convert_8s_aligned16.h b/volk/lib/qa_16s_convert_8s_aligned16.h new file mode 100644 index 000000000..2e409d0cc --- /dev/null +++ b/volk/lib/qa_16s_convert_8s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H +#define INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H + +#include +#include + +class qa_16s_convert_8s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_convert_8s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.cc b/volk/lib/qa_16s_convert_8s_unaligned16.cc new file mode 100644 index 000000000..275ab7668 --- /dev/null +++ b/volk/lib/qa_16s_convert_8s_unaligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_convert_8s_unaligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16s_convert_8s_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int16_t input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); + } + printf("16s_convert_8s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_8s_unaligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_8s_unaligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.h b/volk/lib/qa_16s_convert_8s_unaligned16.h new file mode 100644 index 000000000..4b2fe9e42 --- /dev/null +++ b/volk/lib/qa_16s_convert_8s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H +#define INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H + +#include +#include + +class qa_16s_convert_8s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_convert_8s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H */ diff --git a/volk/lib/qa_16s_max_star_aligned16.cc b/volk/lib/qa_16s_max_star_aligned16.cc new file mode 100644 index 000000000..b46b9ae8e --- /dev/null +++ b/volk/lib/qa_16s_max_star_aligned16.cc @@ -0,0 +1,65 @@ +#include +#include +#include +#include +#include +//test for ssse3 + +#ifndef LV_HAVE_SSSE3 + +void qa_16s_max_star_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + + + +void qa_16s_max_star_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 6400; + const int ITERS = 100000; + short input0[vlen] __attribute__ ((aligned (16))); + short output0[1] __attribute__ ((aligned (16))); + + short output1[1] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + + short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + + input0[i] = plus0 - minus0; + + } + printf("16s_max_star_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_max_star_aligned16_manual(output0, input0, vlen << 1, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_max_star_aligned16_manual(output1, input0, vlen << 1, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < 1; ++i) { + + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_max_star_aligned16.h b/volk/lib/qa_16s_max_star_aligned16.h new file mode 100644 index 000000000..119f87c4d --- /dev/null +++ b/volk/lib/qa_16s_max_star_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H +#define INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H + +#include +#include + +class qa_16s_max_star_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_max_star_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.cc b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc new file mode 100644 index 000000000..4d44735df --- /dev/null +++ b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include +//test for ssse3 + +#ifndef LV_HAVE_SSSE3 + +void qa_16s_max_star_horizontal_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + + +void qa_16s_max_star_horizontal_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 32; + const int ITERS = 1; + short input0[vlen] __attribute__ ((aligned (16))); + short output0[vlen>>1] __attribute__ ((aligned (16))); + + short output1[vlen>>1] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = ((short) (rand() - (RAND_MAX/2))); + + short minus0 = ((short) (rand() - (RAND_MAX/2))); + + input0[i] = plus0 - minus0; + + } + printf("16s_max_star_horizontal_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_max_star_horizontal_aligned16_manual(output0, input0, 2*vlen, "generic"); + volk_16s_max_star_horizontal_aligned16_manual(output0, output0, vlen, "generic"); + volk_16s_max_star_horizontal_aligned16_manual(output0, output0, vlen/2, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + + get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, input0, 2*vlen); + get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, output1, vlen); + get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, output1, vlen); + /* volk_16s_max_star_horizontal_aligned16(output1, input0, 2*vlen, "ssse3"); + volk_16s_max_star_horizontal_aligned16(output1, output1, vlen, "ssse3"); + volk_16s_max_star_horizontal_aligned16(output1, output1, vlen, "ssse3");*/ + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < (vlen >> 1); ++i) { + // printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + + } + for(int i = 0; i < (vlen >> 1); ++i) { + + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); + } + } + + +#endif + diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.h b/volk/lib/qa_16s_max_star_horizontal_aligned16.h new file mode 100644 index 000000000..9f9757253 --- /dev/null +++ b/volk/lib/qa_16s_max_star_horizontal_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H +#define INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H + +#include +#include + +class qa_16s_max_star_horizontal_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_max_star_horizontal_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc new file mode 100644 index 000000000..3c4f5c6cc --- /dev/null +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc @@ -0,0 +1,78 @@ +#include +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_permute_and_scalar_add_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16s_permute_and_scalar_add_aligned16::t1() { + const int vlen = 64; + + unsigned int num_bytes = vlen << 1; + + volk_environment_init(); + clock_t start, end; + double total; + + short target[vlen] __attribute__ ((aligned (16))); + short target2[vlen] __attribute__ ((aligned (16))); + short src0[vlen] __attribute__ ((aligned (16))); + short permute_indexes[vlen] __attribute__ ((aligned (16))); + short cntl0[vlen] __attribute__ ((aligned (16))); + short cntl1[vlen] __attribute__ ((aligned (16))); + short cntl2[vlen] __attribute__ ((aligned (16))); + short cntl3[vlen] __attribute__ ((aligned (16))); + short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4}; + + for(int i = 0; i < vlen; ++i) { + src0[i] = i; + permute_indexes[i] = (3 * i)%vlen; + cntl0[i] = 0xff; + cntl1[i] = 0xff * (i%2); + cntl2[i] = 0xff * ((i>>1)%2); + cntl3[i] = 0xff * ((i%4) == 3); + } + + printf("16s_permute_and_scalar_add_aligned\n"); + + start = clock(); + for(int i = 0; i < 100000; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("generic_time: %f\n", total); + + start = clock(); + for(int i = 0; i < 100000; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("sse2_time: %f\n", total); + + + for(int i = 0; i < vlen; ++i) { + //printf("generic... %d, sse2... 
%d\n", target[i], target2[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT(target[i] == target2[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h new file mode 100644 index 000000000..3643aeef6 --- /dev/null +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H +#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H + +#include +#include + +class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc new file mode 100644 index 000000000..80a220c93 --- /dev/null +++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_quad_max_star_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16s_quad_max_star_aligned16::t1() { + const int vlen = 34; + + short input0[vlen] __attribute__ ((aligned (16))); + short input1[vlen] __attribute__ ((aligned (16))); + short input2[vlen] __attribute__ ((aligned (16))); + short input3[vlen] __attribute__ ((aligned (16))); + + short output0[vlen] __attribute__ ((aligned (16))); + short output1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = (short) (rand() - (RAND_MAX/2)); + short plus1 = (short) (rand() - (RAND_MAX/2)); + short plus2 = (short) (rand() - (RAND_MAX/2)); + short plus3 = (short) (rand() - (RAND_MAX/2)); + + short minus0 = (short) (rand() - (RAND_MAX/2)); + short minus1 = (short) (rand() - (RAND_MAX/2)); + short minus2 = (short) (rand() - (RAND_MAX/2)); + short minus3 = (short) (rand() - (RAND_MAX/2)); + + input0[i] = plus0 - minus0; + input1[i] = plus1 - minus1; + input2[i] = plus2 - minus2; + input3[i] = plus3 - minus3; + } + + volk_16s_quad_max_star_aligned16_manual(output0, input0, input1, input2, input3, 2*vlen, "generic"); + + volk_16s_quad_max_star_aligned16_manual(output1, input0, input1, input2, input3, 2*vlen, "sse2"); + + printf("16s_quad_max_star_aligned\n"); + for(int i = 0; i < vlen; ++i) { + printf("generic... %d, sse2... 
%d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.h b/volk/lib/qa_16s_quad_max_star_aligned16.h new file mode 100644 index 000000000..51e77081a --- /dev/null +++ b/volk/lib/qa_16s_quad_max_star_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H +#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H + +#include +#include + +class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc new file mode 100644 index 000000000..e700ac72c --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc @@ -0,0 +1,76 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSSE3 + +void qa_16sc_deinterleave_16s_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_generic1[vlen] __attribute__ ((aligned (16))); + int16_t output_sse2[vlen] __attribute__ ((aligned (16))); + int16_t output_sse21[vlen] __attribute__ ((aligned (16))); + int16_t output_ssse3[vlen] __attribute__ ((aligned (16))); + int16_t output_ssse31[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] = ((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32678.0)); + } + printf("16sc_deinterleave_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_16s_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_16s_aligned16_manual(output_ssse3, output_ssse31, input0, vlen, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_sse21[i]); + + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); + CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_ssse31[i]); + } +} + +#endif diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.h b/volk/lib/qa_16sc_deinterleave_16s_aligned16.h new file mode 100644 index 000000000..995ab5b34 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H + +#include +#include + +class qa_16sc_deinterleave_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc new file mode 100644 index 000000000..6ee076998 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_16sc_deinterleave_32f_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_generic1[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + float output_sse21[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0)); + } + printf("16sc_deinterleave_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_32f_aligned16_manual(output_sse2, output_sse21, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse21[i], fabs(output_generic1[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.h b/volk/lib/qa_16sc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..fea3b6c2d --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H + +#include +#include + +class qa_16sc_deinterleave_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc new file mode 100644 index 000000000..ca048ea67 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc @@ -0,0 +1,71 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSSE3 + +void qa_16sc_deinterleave_real_16s_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_real_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse2[vlen] __attribute__ ((aligned (16))); + int16_t output_ssse3[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] = ((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32678.0)); + } + printf("16sc_deinterleave_real_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_16s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_16s_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_16s_aligned16_manual(output_ssse3, input0, vlen, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < vlen; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + // printf("%d = generic... %d, sse2... %d, ssse3... 
%d\n", i, output_generic[i], output_sse2[i], output_ssse3[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_ssse3[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..ebb70b97a --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H + +#include +#include + +class qa_16sc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc new file mode 100644 index 000000000..0f4ba6923 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc @@ -0,0 +1,123 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +#ifndef LV_HAVE_SSE + +void qa_16sc_deinterleave_real_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_real_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0)); + } + printf("16sc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + } +} + +#endif /* SSE */ + +#else + +void qa_16sc_deinterleave_real_32f_aligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0); + } + printf("16sc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_16sc_deinterleave_real_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4); + } +} + +#endif /* SSE4_1 */ diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..e83426473 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H + +#include +#include + +class qa_16sc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc new file mode 100644 index 000000000..5ab458bc9 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSSE3 + +void qa_16sc_deinterleave_real_8s_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_real_8s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_ssse3[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0); + } + printf("16sc_deinterleave_real_8s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_8s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); + } +} + +#endif diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h new file mode 100644 index 000000000..04e5511e5 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H + +#include +#include + +class qa_16sc_deinterleave_real_8s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_8s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc new file mode 100644 index 000000000..b14610757 --- /dev/null +++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE3 + +void qa_16sc_magnitude_16s_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#else + +void qa_16sc_magnitude_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse[vlen] __attribute__ ((aligned (16))); + int16_t output_sse3[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0)); + } + printf("16sc_magnitude_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_16s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_16s_aligned16_manual(output_sse, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_16s_aligned16_manual(output_sse3, input0, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1); + } +} + +#endif diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.h b/volk/lib/qa_16sc_magnitude_16s_aligned16.h new file mode 100644 index 000000000..4664b70f4 --- /dev/null +++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H +#define INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H + +#include +#include + +class qa_16sc_magnitude_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_magnitude_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc new file mode 100644 index 000000000..06dff2fd5 --- /dev/null +++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE3 + +void qa_16sc_magnitude_32f_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#else + +void qa_16sc_magnitude_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse3[vlen] __attribute__ ((aligned (16))); + + int16_t* inputLoad = (int16_t*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("16sc_magnitude_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_32f_aligned16_manual(output_sse3, input0, 32768.0, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.h b/volk/lib/qa_16sc_magnitude_32f_aligned16.h new file mode 100644 index 000000000..0c25673ea --- /dev/null +++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H +#define INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H + +#include +#include + +class qa_16sc_magnitude_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_magnitude_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc new file mode 100644 index 000000000..6b19828a4 --- /dev/null +++ b/volk/lib/qa_16u_byteswap_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_16u_byteswap_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16u_byteswap_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100001; + + uint16_t output0[vlen] __attribute__ ((aligned (16))); + uint16_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + output0[i] = (uint16_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); + } + memcpy(output01, output0, vlen*sizeof(uint16_t)); + + printf("16u_byteswap_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16u_byteswap_aligned16_manual(output0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16u_byteswap_aligned16_manual(output01, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_16u_byteswap_aligned16.h b/volk/lib/qa_16u_byteswap_aligned16.h new file mode 100644 index 000000000..e11b23e3f --- /dev/null +++ b/volk/lib/qa_16u_byteswap_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H +#define INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H + +#include +#include + +class qa_16u_byteswap_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16u_byteswap_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_accumulator_aligned16.cc b/volk/lib/qa_32f_accumulator_aligned16.cc new file mode 100644 index 000000000..ea637d600 --- /dev/null +++ b/volk/lib/qa_32f_accumulator_aligned16.cc @@ -0,0 +1,56 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_accumulator_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_accumulator_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float accumulator_generic; + float accumulator_sse; + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_accumulator_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_accumulator_aligned16_manual(&accumulator_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_accumulator_aligned16_manual(&accumulator_sse, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(accumulator_generic, accumulator_sse, fabs(accumulator_generic)*1e-4); +} + +#endif diff --git a/volk/lib/qa_32f_accumulator_aligned16.h b/volk/lib/qa_32f_accumulator_aligned16.h new file mode 100644 index 000000000..0004d3ff0 --- /dev/null +++ b/volk/lib/qa_32f_accumulator_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H +#define INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H + +#include +#include + +class qa_32f_accumulator_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_accumulator_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc new file mode 100644 index 000000000..92f35c7ec --- /dev/null +++ b/volk/lib/qa_32f_add_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_add_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_add_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_add_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_add_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_add_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_add_aligned16.h b/volk/lib/qa_32f_add_aligned16.h new file mode 100644 index 000000000..58e2a151c --- /dev/null +++ b/volk/lib/qa_32f_add_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_ADD_ALIGNED16_H +#define INCLUDED_QA_32F_ADD_ALIGNED16_H + +#include +#include + +class qa_32f_add_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_add_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_ADD_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc new file mode 100644 index 000000000..3c8137004 --- /dev/null +++ b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_calc_spectral_noise_floor_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_calc_spectral_noise_floor_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float output0[1] __attribute__ ((aligned (16))); + float output01[1] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_calc_spectral_noise_floor_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_calc_spectral_noise_floor_aligned16_manual(output0, input0, 20, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_calc_spectral_noise_floor_aligned16_manual(output01, input0, 20, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < 1; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h new file mode 100644 index 000000000..c5dce2c4b --- /dev/null +++ b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H +#define INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H + +#include +#include + +class qa_32f_calc_spectral_noise_floor_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_calc_spectral_noise_floor_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_16s_aligned16.cc b/volk/lib/qa_32f_convert_16s_aligned16.cc new file mode 100644 index 000000000..84a4c40c4 --- /dev/null +++ b/volk/lib/qa_32f_convert_16s_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_16s_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32f_convert_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse[vlen] __attribute__ ((aligned (16))); + int16_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_16s_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < vlen; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("%d generic... %d, sse... %d sse2... 
%d\n", i, output_generic[i], output_sse[i], output_sse2[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_16s_aligned16.h b/volk/lib/qa_32f_convert_16s_aligned16.h new file mode 100644 index 000000000..fce1eb417 --- /dev/null +++ b/volk/lib/qa_32f_convert_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H + +#include +#include + +class qa_32f_convert_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.cc b/volk/lib/qa_32f_convert_16s_unaligned16.cc new file mode 100644 index 000000000..9469daed2 --- /dev/null +++ b/volk/lib/qa_32f_convert_16s_unaligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_16s_unaligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32f_convert_16s_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse[vlen] __attribute__ ((aligned (16))); + int16_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_16s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_16s_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_16s_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_16s_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.h b/volk/lib/qa_32f_convert_16s_unaligned16.h new file mode 100644 index 000000000..492bc80e6 --- /dev/null +++ b/volk/lib/qa_32f_convert_16s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H + +#include +#include + +class qa_32f_convert_16s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_16s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_32s_aligned16.cc b/volk/lib/qa_32f_convert_32s_aligned16.cc new file mode 100644 index 000000000..ff24c7b0d --- /dev/null +++ b/volk/lib/qa_32f_convert_32s_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_32s_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32f_convert_32s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int32_t output_generic[vlen] __attribute__ ((aligned (16))); + int32_t output_sse[vlen] __attribute__ ((aligned (16))); + int32_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_32s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_32s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_32s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_32s_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_32s_aligned16.h b/volk/lib/qa_32f_convert_32s_aligned16.h new file mode 100644 index 000000000..97d854463 --- /dev/null +++ b/volk/lib/qa_32f_convert_32s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H + +#include +#include + +class qa_32f_convert_32s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_32s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.cc b/volk/lib/qa_32f_convert_32s_unaligned16.cc new file mode 100644 index 000000000..e63b17994 --- /dev/null +++ b/volk/lib/qa_32f_convert_32s_unaligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_32s_unaligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32f_convert_32s_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int32_t output_generic[vlen] __attribute__ ((aligned (16))); + int32_t output_sse[vlen] __attribute__ ((aligned (16))); + int32_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_32s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_32s_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_32s_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_32s_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.h b/volk/lib/qa_32f_convert_32s_unaligned16.h new file mode 100644 index 000000000..5d662d86d --- /dev/null +++ b/volk/lib/qa_32f_convert_32s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H + +#include +#include + +class qa_32f_convert_32s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_32s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_64f_aligned16.cc b/volk/lib/qa_32f_convert_64f_aligned16.cc new file mode 100644 index 000000000..c546e47de --- /dev/null +++ b/volk/lib/qa_32f_convert_64f_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_64f_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32f_convert_64f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + double output_generic[vlen] __attribute__ ((aligned (16))); + double output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_64f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_64f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_64f_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i] ,output_sse2[i], fabs(output_generic[i])*1e-6); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_64f_aligned16.h b/volk/lib/qa_32f_convert_64f_aligned16.h new file mode 100644 index 000000000..41eb3e094 --- /dev/null +++ b/volk/lib/qa_32f_convert_64f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H + +#include +#include + +class qa_32f_convert_64f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_64f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.cc b/volk/lib/qa_32f_convert_64f_unaligned16.cc new file mode 100644 index 000000000..24b51f9af --- /dev/null +++ b/volk/lib/qa_32f_convert_64f_unaligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_64f_unaligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32f_convert_64f_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + double output_generic[vlen] __attribute__ ((aligned (16))); + double output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_64f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_64f_unaligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_64f_unaligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.h b/volk/lib/qa_32f_convert_64f_unaligned16.h new file mode 100644 index 000000000..4b144f033 --- /dev/null +++ b/volk/lib/qa_32f_convert_64f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H + +#include +#include + +class qa_32f_convert_64f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_64f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_8s_aligned16.cc b/volk/lib/qa_32f_convert_8s_aligned16.cc new file mode 100644 index 000000000..a3d4d6567 --- /dev/null +++ b/volk/lib/qa_32f_convert_8s_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_8s_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32f_convert_8s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_sse[vlen] __attribute__ ((aligned (16))); + int8_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_8s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_aligned16_manual(output_generic, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_aligned16_manual(output_sse, input0, 128.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_aligned16_manual(output_sse2, input0, 128.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_8s_aligned16.h b/volk/lib/qa_32f_convert_8s_aligned16.h new file mode 100644 index 000000000..68a523f34 --- /dev/null +++ b/volk/lib/qa_32f_convert_8s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H + +#include +#include + +class qa_32f_convert_8s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_8s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.cc b/volk/lib/qa_32f_convert_8s_unaligned16.cc new file mode 100644 index 000000000..d885fd6bb --- /dev/null +++ b/volk/lib/qa_32f_convert_8s_unaligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_8s_unaligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32f_convert_8s_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_sse[vlen] __attribute__ ((aligned (16))); + int8_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_8s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_unaligned16_manual(output_generic, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_unaligned16_manual(output_sse, input0, 128.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_unaligned16_manual(output_sse2, input0, 128.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.h b/volk/lib/qa_32f_convert_8s_unaligned16.h new file mode 100644 index 000000000..88d4ff42a --- /dev/null +++ b/volk/lib/qa_32f_convert_8s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H + +#include +#include + +class qa_32f_convert_8s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_8s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc new file mode 100644 index 000000000..b20999beb --- /dev/null +++ b/volk/lib/qa_32f_divide_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_divide_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_divide_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_divide_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_divide_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_divide_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_divide_aligned16.h b/volk/lib/qa_32f_divide_aligned16.h new file mode 100644 index 000000000..79d5ae4b8 --- /dev/null +++ b/volk/lib/qa_32f_divide_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_DIVIDE_ALIGNED16_H +#define INCLUDED_QA_32F_DIVIDE_ALIGNED16_H + +#include +#include + +class qa_32f_divide_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_divide_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_DIVIDE_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_dot_prod_aligned16.cc b/volk/lib/qa_32f_dot_prod_aligned16.cc new file mode 100644 index 000000000..98c1f2d99 --- /dev/null +++ b/volk/lib/qa_32f_dot_prod_aligned16.cc @@ -0,0 +1,183 @@ +#include +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifndef LV_HAVE_SSE4_1 + +#ifdef LV_HAVE_SSE3 +void qa_32f_dot_prod_aligned16::t1() { + const int vlen = 2046; + const int ITER = 100000; + + int i; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float * input; + float * taps; + + float * result_generic; + float * result_sse; + float * result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen* sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen *sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); + + 
random_floats((float*)input, vlen); + random_floats((float*)taps, vlen); + + + printf("32f_dot_prod_aligned16\n"); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_generic[i], input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_sse[i], input, taps, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + printf("generic: %f ... sse: %f ... sse3 %f \n", result_generic[0], result_sse[0], result_sse3[0]); + + for(i = 0; i < ITER; i++){ + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse); + free(result_sse3); + +} +#else +void qa_32f_dot_prod_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#endif /* LV_HAVE_SSE3 */ + +#else + +void qa_32f_dot_prod_aligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 4095; + const int ITER = 100000; + + int i; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float * input; + float * taps; + + float * result_generic; + float * result_sse; + float * result_sse3; + float * result_sse4_1; + + ret = posix_memalign((void**)&input, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse4_1, 16, ITER*sizeof(float)); + + random_floats((float*)input, vlen); + random_floats((float*)taps, vlen); + + printf("32f_dot_prod_aligned16\n"); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_generic[i], input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_sse[i], input, taps, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + get_volk_runtime()->volk_32f_dot_prod_aligned16(&result_sse4_1[i], input, taps, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + //printf("generic: %f ... sse: %f ... sse3 %f ... 
sse4_1 %f \n", result_generic[0], result_sse[0], result_sse3[0], result_sse4_1[0]); + for(i =0; i < ITER; i++){ + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse4_1[i], fabs(result_generic[i])*ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse); + free(result_sse3); + free(result_sse4_1); + +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32f_dot_prod_aligned16.h b/volk/lib/qa_32f_dot_prod_aligned16.h new file mode 100644 index 000000000..6931a9e98 --- /dev/null +++ b/volk/lib/qa_32f_dot_prod_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H +#define INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H + +#include +#include + +class qa_32f_dot_prod_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_dot_prod_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_dot_prod_unaligned16.cc b/volk/lib/qa_32f_dot_prod_unaligned16.cc new file mode 100644 index 000000000..8e97d4249 --- /dev/null +++ b/volk/lib/qa_32f_dot_prod_unaligned16.cc @@ -0,0 +1,190 @@ +#include +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifndef LV_HAVE_SSE4_1 + +#ifdef LV_HAVE_SSE3 +void qa_32f_dot_prod_unaligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 2046; + const int ITER = 100000; + + int i; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float * input; + float * taps; + + 
float * result_generic; + float * result_sse; + float * result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen* sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen *sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); + + random_floats((float*)input, vlen); + random_floats((float*)taps, vlen); + + + printf("32f_dot_prod_unaligned16\n"); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_generic[i], input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_sse[i], input, taps, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + printf("generic: %f ... sse: %f ... sse3 %f \n", result_generic[0], result_sse[0], result_sse3[0]); + + for(i = 0; i < ITER; i++){ + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse); + free(result_sse3); + +} +#else +void qa_32f_dot_prod_unaligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#endif /* LV_HAVE_SSE3 */ + +#else + +void qa_32f_dot_prod_unaligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 4095; + const int ITER = 100000; + + int i; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float * input; + float * taps; + + float * result_generic; + float * result_sse; + float * result_sse3; + float * result_sse4_1; + + ret = posix_memalign((void**)&input, 16, (vlen+1) * sizeof(float)); + ret = posix_memalign((void**)&taps, 16, (vlen+1) * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse4_1, 16, ITER*sizeof(float)); + + input = &input[1]; // Make sure the buffer is unaligned + taps = &taps[1]; // Make sure the buffer is unaligned + + random_floats((float*)input, vlen); + random_floats((float*)taps, vlen); + + printf("32f_dot_prod_unaligned16\n"); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_generic[i], input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_sse[i], input, taps, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + get_volk_runtime()->volk_32f_dot_prod_unaligned16(&result_sse4_1[i], input, taps, vlen); + } + end = clock(); + total = 
(double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + //printf("generic: %f ... sse: %f ... sse3 %f ... sse4_1 %f \n", result_generic[0], result_sse[0], result_sse3[0], result_sse4_1[0]); + for(i =0; i < ITER; i++){ + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse4_1[i], fabs(result_generic[i])*ERR_DELTA); + } + + free(&input[-1]); + free(&taps[-1]); + free(result_generic); + free(result_sse); + free(result_sse3); + free(result_sse4_1); + +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32f_dot_prod_unaligned16.h b/volk/lib/qa_32f_dot_prod_unaligned16.h new file mode 100644 index 000000000..e8bad07fe --- /dev/null +++ b/volk/lib/qa_32f_dot_prod_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H +#define INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H + +#include +#include + +class qa_32f_dot_prod_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_dot_prod_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc new file mode 100644 index 000000000..ca65add28 --- /dev/null +++ b/volk/lib/qa_32f_fm_detect_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_fm_detect_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_fm_detect_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + float input0[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_fm_detect_aligned\n"); + + start = clock(); + float save = 0.1; + for(int count = 0; count < ITERS; ++count) { + volk_32f_fm_detect_aligned16_manual(output0, input0, 1.0, &save, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + save = 0.1; + for(int count = 0; count < ITERS; ++count) { + volk_32f_fm_detect_aligned16_manual(output01, input0, 1.0, &save, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_fm_detect_aligned16.h b/volk/lib/qa_32f_fm_detect_aligned16.h new file mode 100644 index 000000000..a2680c524 --- /dev/null +++ b/volk/lib/qa_32f_fm_detect_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H +#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H + +#include +#include + +class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc new file mode 100644 index 000000000..a1c3d4cd1 --- /dev/null +++ b/volk/lib/qa_32f_index_max_aligned16.cc @@ -0,0 +1,103 @@ +#include +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 1000000 +#define VEC_LEN 3097 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE + +void qa_32f_index_max_aligned16::t1(){ + printf("sse not available... 
no test performed\n"); +} + +#else + + +void qa_32f_index_max_aligned16::t1(){ + + const int vlen = VEC_LEN; + + + volk_runtime_init(); + + volk_environment_init(); + int ret; + + unsigned int* target_sse4_1; + unsigned int* target_sse; + unsigned int* target_generic; + float* src0 ; + + + unsigned int i_target_sse4_1; + target_sse4_1 = &i_target_sse4_1; + unsigned int i_target_sse; + target_sse = &i_target_sse; + unsigned int i_target_generic; + target_generic = &i_target_generic; + + ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float)); + + random_floats((float*)src0, vlen); + + printf("32f_index_max_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + get_volk_runtime()->volk_32f_index_max_aligned16(target_sse4_1, src0, vlen); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1 time: %f\n", total); + + + printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]); + CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]); + CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]); + + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32f_index_max_aligned16.h b/volk/lib/qa_32f_index_max_aligned16.h new file mode 100644 index 000000000..8cadffa47 --- /dev/null +++ b/volk/lib/qa_32f_index_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H +#define 
INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H + +#include +#include + +class qa_32f_index_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.cc b/volk/lib/qa_32f_interleave_16sc_aligned16.cc new file mode 100644 index 000000000..2a937637f --- /dev/null +++ b/volk/lib/qa_32f_interleave_16sc_aligned16.cc @@ -0,0 +1,75 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_32f_interleave_16sc_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32f_interleave_16sc_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + std::complex output_generic[vlen] __attribute__ ((aligned (16))); + std::complex output_sse[vlen] __attribute__ ((aligned (16))); + std::complex output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_interleave_16sc_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_16sc_aligned16_manual(output_generic, input0, input1, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_16sc_aligned16_manual(output_sse, input0, input1, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + 
printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_16sc_aligned16_manual(output_sse2, input0, input1, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse[i]), 1.01); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse[i]), 1.01); + + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse2[i]), 1.01); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse2[i]), 1.01); + } +} + +#endif diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.h b/volk/lib/qa_32f_interleave_16sc_aligned16.h new file mode 100644 index 000000000..8d2914817 --- /dev/null +++ b/volk/lib/qa_32f_interleave_16sc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H +#define INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H + +#include +#include + +class qa_32f_interleave_16sc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_interleave_16sc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.cc b/volk/lib/qa_32f_interleave_32fc_aligned16.cc new file mode 100644 index 000000000..c22dd1046 --- /dev/null +++ b/volk/lib/qa_32f_interleave_32fc_aligned16.cc @@ -0,0 +1,62 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_interleave_32fc_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_interleave_32fc_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + std::complex output_generic[vlen] __attribute__ ((aligned (16))); + std::complex output_sse[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_interleave_32fc_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_32fc_aligned16_manual(output_generic, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_32fc_aligned16_manual(output_sse, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse[i]), fabs(std::real(output_generic[i]))*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse[i]), fabs(std::imag(output_generic[i]))*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.h b/volk/lib/qa_32f_interleave_32fc_aligned16.h new file mode 100644 index 000000000..cba518d37 --- /dev/null +++ b/volk/lib/qa_32f_interleave_32fc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H +#define INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H + +#include +#include + +class qa_32f_interleave_32fc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_interleave_32fc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_max_aligned16.cc b/volk/lib/qa_32f_max_aligned16.cc new file mode 100644 index 000000000..3ef375176 --- /dev/null +++ b/volk/lib/qa_32f_max_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_max_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_max_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_max_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_max_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_max_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_max_aligned16.h b/volk/lib/qa_32f_max_aligned16.h new file mode 100644 index 000000000..d535479f4 --- /dev/null +++ b/volk/lib/qa_32f_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_MAX_ALIGNED16_H +#define INCLUDED_QA_32F_MAX_ALIGNED16_H + +#include +#include + +class qa_32f_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_min_aligned16.cc b/volk/lib/qa_32f_min_aligned16.cc new file mode 100644 index 000000000..617e18b24 --- /dev/null +++ b/volk/lib/qa_32f_min_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_min_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_min_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_min_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_min_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_min_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_min_aligned16.h b/volk/lib/qa_32f_min_aligned16.h new file mode 100644 index 000000000..90961ac92 --- /dev/null +++ b/volk/lib/qa_32f_min_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_MIN_ALIGNED16_H +#define INCLUDED_QA_32F_MIN_ALIGNED16_H + +#include +#include + +class qa_32f_min_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_min_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_MIN_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc new file mode 100644 index 000000000..c77fe97da --- /dev/null +++ b/volk/lib/qa_32f_multiply_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_multiply_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_multiply_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_multiply_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_multiply_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_multiply_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_multiply_aligned16.h b/volk/lib/qa_32f_multiply_aligned16.h new file mode 100644 index 000000000..7032a2ad4 --- /dev/null +++ b/volk/lib/qa_32f_multiply_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H +#define INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H + +#include +#include + +class qa_32f_multiply_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_multiply_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_normalize_aligned16.cc b/volk/lib/qa_32f_normalize_aligned16.cc new file mode 100644 index 000000000..2954fc3ae --- /dev/null +++ b/volk/lib/qa_32f_normalize_aligned16.cc @@ -0,0 +1,65 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_normalize_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_normalize_aligned16::t1() { + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + const int vlen = 320001; + const int ITERS = 100; + + float* output0; + float* output01; + ret = posix_memalign((void**)&output0, 16, vlen*sizeof(float)); + ret = posix_memalign((void**)&output01, 16, vlen*sizeof(float)); + + for(int i = 0; i < vlen; ++i) { + output0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + memcpy(output01, output0, vlen*sizeof(float)); + printf("32f_normalize_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_normalize_aligned16_manual(output0, 1.15, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_normalize_aligned16_manual(output01, 1.15, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + // printf("%e...%e\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); + } + + free(output0); + free(output01); +} + +#endif diff --git a/volk/lib/qa_32f_normalize_aligned16.h b/volk/lib/qa_32f_normalize_aligned16.h new file mode 100644 index 000000000..7c421eb82 --- /dev/null +++ b/volk/lib/qa_32f_normalize_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H +#define INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H + +#include +#include + +class qa_32f_normalize_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_normalize_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_power_aligned16.cc b/volk/lib/qa_32f_power_aligned16.cc new file mode 100644 index 000000000..1b331daeb --- /dev/null +++ b/volk/lib/qa_32f_power_aligned16.cc @@ -0,0 +1,95 @@ +#include +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifdef LV_HAVE_SSE +void qa_32f_power_aligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 2046; + const int ITERS = 10000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float* input; + int i; + + float* result_generic; + float* result_sse; + float* result_sse4_1; + + ret = posix_memalign((void**)&input, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_sse4_1, 16, vlen * sizeof(float)); + + 
random_floats((float*)input, vlen); + + const float power = 3; + + printf("32f_power_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_power_aligned16_manual(result_generic, input, power, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_power_aligned16_manual(result_sse, input, power, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32f_power_aligned16(result_sse4_1, input, power, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1_time: %f\n", total); + + + for(i = 0; i < vlen; i++){ + //printf("%d %e -> %e %e %e\n", i, input[i], result_generic[i], result_sse[i], result_sse4_1[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(result_generic[i], result_sse[i], fabs(result_generic[i])* ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL(result_generic[i], result_sse4_1[i], fabs(result_generic[i])* ERR_DELTA); + } + + free(input); + free(result_generic); + free(result_sse); + +} +#else +void qa_32f_power_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#endif /* LV_HAVE_SSE */ + diff --git a/volk/lib/qa_32f_power_aligned16.h b/volk/lib/qa_32f_power_aligned16.h new file mode 100644 index 000000000..d45df4e56 --- /dev/null +++ b/volk/lib/qa_32f_power_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_POWER_ALIGNED16_H +#define INCLUDED_QA_32F_POWER_ALIGNED16_H + +#include +#include + +class qa_32f_power_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_power_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_POWER_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_sqrt_aligned16.cc b/volk/lib/qa_32f_sqrt_aligned16.cc new file mode 100644 index 000000000..a3e6abc18 --- /dev/null +++ b/volk/lib/qa_32f_sqrt_aligned16.cc @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_sqrt_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_sqrt_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + // No reason to test negative numbers because they result in NaN. 
+ for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand()) / static_cast(RAND_MAX)); + } + printf("32f_sqrt_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_sqrt_aligned16_manual(output01, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_sqrt_aligned16.h b/volk/lib/qa_32f_sqrt_aligned16.h new file mode 100644 index 000000000..e4b99d981 --- /dev/null +++ b/volk/lib/qa_32f_sqrt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_SQRT_ALIGNED16_H +#define INCLUDED_QA_32F_SQRT_ALIGNED16_H + +#include +#include + +class qa_32f_sqrt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_sqrt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_SQRT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_stddev_aligned16.cc b/volk/lib/qa_32f_stddev_aligned16.cc new file mode 100644 index 000000000..c0f22cdea --- /dev/null +++ b/volk/lib/qa_32f_stddev_aligned16.cc @@ -0,0 +1,74 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_stddev_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_stddev_aligned16::t1() { + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float stddev_generic; + float stddev_sse; + float stddev_sse4_1; + float mean = 0; + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + mean += input0[i]; + } + mean /= static_cast(vlen); + + printf("32f_stddev_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_stddev_aligned16_manual(&stddev_generic, input0, mean, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_stddev_aligned16_manual(&stddev_sse, input0, mean, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32f_stddev_aligned16(&stddev_sse4_1, input0, mean, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse, fabs(stddev_generic)*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse4_1, fabs(stddev_generic)*1e-4); + +} + +#endif diff --git a/volk/lib/qa_32f_stddev_aligned16.h b/volk/lib/qa_32f_stddev_aligned16.h new file mode 100644 index 000000000..7f8d7a5fc --- /dev/null +++ b/volk/lib/qa_32f_stddev_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_STDDEV_ALIGNED16_H +#define INCLUDED_QA_32F_STDDEV_ALIGNED16_H + +#include +#include + +class qa_32f_stddev_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_stddev_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_STDDEV_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.cc b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc new file mode 100644 index 000000000..dcad8bcf3 --- /dev/null +++ b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc @@ -0,0 +1,75 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_stddev_and_mean_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_stddev_and_mean_aligned16::t1() { + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float stddev_generic; + float stddev_sse; + float stddev_sse4_1; + float mean_generic; + float mean_sse; + float mean_sse4_1; + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_stddev_and_mean_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_stddev_and_mean_aligned16_manual(&stddev_generic, &mean_generic, input0,vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_stddev_and_mean_aligned16_manual(&stddev_sse, &mean_sse, input0,vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32f_stddev_and_mean_aligned16(&stddev_sse4_1, &mean_sse4_1, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse, fabs(stddev_generic)*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(mean_generic, mean_sse, fabs(mean_generic)*1e-4); + + CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse4_1, fabs(stddev_generic)*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(mean_generic, mean_sse4_1, fabs(mean_generic)*1e-4); + +} + +#endif diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.h b/volk/lib/qa_32f_stddev_and_mean_aligned16.h new file mode 100644 index 000000000..e08bd249a --- /dev/null +++ b/volk/lib/qa_32f_stddev_and_mean_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H +#define INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H + +#include +#include + +class qa_32f_stddev_and_mean_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_stddev_and_mean_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_subtract_aligned16.cc b/volk/lib/qa_32f_subtract_aligned16.cc new file mode 100644 index 000000000..a7e1b5ae3 --- /dev/null +++ b/volk/lib/qa_32f_subtract_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_subtract_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_subtract_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_subtract_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_subtract_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_subtract_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_subtract_aligned16.h b/volk/lib/qa_32f_subtract_aligned16.h new file mode 100644 index 000000000..97c14f129 --- /dev/null +++ b/volk/lib/qa_32f_subtract_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H +#define INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H + +#include +#include + +class qa_32f_subtract_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_subtract_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_sum_of_poly_aligned16.cc b/volk/lib/qa_32f_sum_of_poly_aligned16.cc new file mode 100644 index 000000000..494776357 --- /dev/null +++ b/volk/lib/qa_32f_sum_of_poly_aligned16.cc @@ -0,0 +1,142 @@ +#include +#include +#include +#include +#include +#include + +#define SNR 30.0 +#define CENTER -4.0 +#define CUTOFF -5.595 +#define ERR_DELTA (1e-4) +#define NUM_ITERS 100000 +#define VEC_LEN 64 +static float uniform() { + return ((float) rand() / RAND_MAX); // uniformly (0, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * -SNR/2.0; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32f_sum_of_poly_aligned16::t1(){ + printf("sse3 not available... 
no test performed\n"); +} + +#else + + +void qa_32f_sum_of_poly_aligned16::t1(){ + int i = 0; + + volk_environment_init(); + int ret; + + const int vlen = VEC_LEN; + float cutoff = CUTOFF; + + float* center_point_array; + float* target; + float* target_generic; + float* src0 ; + + + ret = posix_memalign((void**)&center_point_array, 16, 24); + ret = posix_memalign((void**)&target, 16, 4); + ret = posix_memalign((void**)&target_generic, 16, 4); + ret = posix_memalign((void**)&src0, 16, (vlen << 2)); + + + random_floats((float*)src0, vlen); + + float a = (float)CENTER; + float etoa = expf(a); + center_point_array[0] = (//(5.0 * a * a * a * a)/120.0 + + (-4.0 * a * a * a)/24.0 + + (3.0 * a * a)/6.0 + + (-2.0 * a)/2.0 + + (1.0)) * etoa; + center_point_array[1] = (//(-10.0 * a * a * a)/120.0 + + (6.0 * a * a)/24.0 + + (-3.0 * a)/6.0 + + (1.0/2.0)) * etoa; + center_point_array[2] = (//(10.0 * a * a)/120.0 + + (-4.0 * a)/24.0 + + (1.0/6.0)) * etoa; + center_point_array[3] = (//(-5.0 * a)/120.0 + + (1.0/24.0)) * etoa; + //center_point_array[4] = ((1.0)/120.0) * etoa; + center_point_array[4] = (//(a * a * a * a * a)/120.0 + + (a * a * a * a)/24.0 + + (a * a * a)/-6.0 + + (a * a)/2.0 + + -a + 1.0) * etoa; + + printf("32f_sum_of_poly_aligned16\n"); + + clock_t start, end; + double total; + + float my_sum = 0.0; + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + float sum = 0.0; + for(int l = 0; l < vlen; ++l) { + + sum += expf(src0[l]); + + } + my_sum = sum; + } + + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("exp time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + + volk_32f_sum_of_poly_aligned16_manual(target_generic, src0, center_point_array, &cutoff, vlen << 2, "generic"); + + } + + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_sum_of_poly_aligned16_manual(target, src0, 
center_point_array, &cutoff, vlen << 2, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 approx time: %f\n", total); + + + + printf("exp: %f, sse3: %f\n", my_sum, target[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], fabs(target_generic[0]) * ERR_DELTA); + + + free(center_point_array); + free(target); + free(target_generic); + free(src0); + + +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32f_sum_of_poly_aligned16.h b/volk/lib/qa_32f_sum_of_poly_aligned16.h new file mode 100644 index 000000000..67a347f9a --- /dev/null +++ b/volk/lib/qa_32f_sum_of_poly_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H +#define INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H + +#include +#include + +class qa_32f_sum_of_poly_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_sum_of_poly_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.cc b/volk/lib/qa_32fc_32f_multiply_aligned16.cc new file mode 100644 index 000000000..4eba0a3cd --- /dev/null +++ b/volk/lib/qa_32fc_32f_multiply_aligned16.cc @@ -0,0 +1,85 @@ +#include +#include +#include +#include +#include +#include + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifdef LV_HAVE_SSE3 +void qa_32fc_32f_multiply_aligned16::t1() { + + const int vlen = 2046; + const int ITERS = 100000; + 
+ volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + float * taps; + int i; + + std::complex* result_generic; + std::complex* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, vlen * 2 * sizeof(float)); + + random_floats((float*)input, vlen * 2); + random_floats(taps, vlen); + + printf("32fc_32f_multiply_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_multiply_aligned16_manual(result_generic, input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} +#else +void qa_32fc_32f_multiply_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#endif /* LV_HAVE_SSE3 */ + diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.h b/volk/lib/qa_32fc_32f_multiply_aligned16.h new file mode 100644 index 000000000..fc3b3eeb2 --- /dev/null +++ b/volk/lib/qa_32fc_32f_multiply_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H +#define INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H + +#include +#include + +class qa_32fc_32f_multiply_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_32f_multiply_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc b/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc new file mode 100644 index 000000000..64ea65da9 --- /dev/null +++ b/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1.5e-3) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifdef LV_HAVE_SSE +void qa_32fc_32f_power_32fc_aligned16::t1() { + + const int vlen = 2046; + const int ITERS = 10000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + int i; + + std::complex* result_generic; + std::complex* result_sse; + + ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float)); + ret = posix_memalign((void**)&result_sse, 
16, vlen * 2 * sizeof(float)); + + random_floats((float*)input, vlen * 2); + + const float power = 3.2; + + printf("32fc_32f_power_32fc_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_power_32fc_aligned16_manual(result_generic, input, power, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_power_32fc_aligned16_manual(result_sse, input, power, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + assertcomplexEqual(result_generic[i], result_sse[i], ERR_DELTA); + } + + free(input); + free(result_generic); + free(result_sse); + +} +#else +void qa_32fc_32f_power_32fc_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#endif /* LV_HAVE_SSE */ + diff --git a/volk/lib/qa_32fc_32f_power_32fc_aligned16.h b/volk/lib/qa_32fc_32f_power_32fc_aligned16.h new file mode 100644 index 000000000..464b7b7cc --- /dev/null +++ b/volk/lib/qa_32fc_32f_power_32fc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H +#define INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H + +#include +#include + +class qa_32fc_32f_power_32fc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_32f_power_32fc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.cc b/volk/lib/qa_32fc_atan2_32f_aligned16.cc new file mode 100644 index 000000000..a24382d71 --- /dev/null +++ b/volk/lib/qa_32fc_atan2_32f_aligned16.cc @@ -0,0 +1,75 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32fc_atan2_32f_aligned16::t1() { + printf("sse 
not available... no test performed\n"); +} + +#else + +void qa_32fc_atan2_32f_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_atan2_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_atan2_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_atan2_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32fc_atan2_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.h b/volk/lib/qa_32fc_atan2_32f_aligned16.h new file mode 100644 index 000000000..9c4dc209a --- /dev/null +++ b/volk/lib/qa_32fc_atan2_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_atan2_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_atan2_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc new file mode 100644 index 000000000..497914e0a --- /dev/null +++ b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc @@ -0,0 +1,137 @@ +#include +#include +#include +#include + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse + +#if LV_HAVE_SSE && LV_HAVE_64 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_conjugate_dot_prod_aligned16::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex* input; + std::complex* taps; + + std::complex* 
result_generic; + std::complex* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex(0,0); + result[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_conjugate_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_conjugate_dot_prod_aligned16_manual(result, input, taps, vlen * 8, "sse"); + + printf("32fc_conjugate_dot_prod_aligned16\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#elif LV_HAVE_SSE && LV_HAVE_32 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_conjugate_dot_prod_aligned16::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex(0,0); + result[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_conjugate_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_conjugate_dot_prod_aligned16_manual(result, 
input, taps, vlen * 8, "sse_32"); + + printf("32fc_conjugate_dot_prod_aligned16\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#else + +void qa_32fc_conjugate_dot_prod_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h new file mode 100644 index 000000000..507b1769b --- /dev/null +++ b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H +#define INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H + +#include +#include + +class qa_32fc_conjugate_dot_prod_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_conjugate_dot_prod_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc new file mode 100644 index 000000000..0f5a030f5 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32fc_deinterleave_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_generic1[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse1[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_deinterleave_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], fabs(output_generic1[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.h b/volk/lib/qa_32fc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..78660e6ad --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_deinterleave_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc new file mode 100644 index 000000000..6e051afbc --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32fc_deinterleave_64f_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_64f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + double output_generic[vlen] __attribute__ ((aligned (16))); + double output_generic1[vlen] __attribute__ ((aligned (16))); + double output_sse2[vlen] __attribute__ ((aligned (16))); + double output_sse21[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_deinterleave_64f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_64f_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_64f_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse21[i], fabs(output_generic1[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.h b/volk/lib/qa_32fc_deinterleave_64f_aligned16.h new file mode 100644 index 000000000..f924b9752 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_64f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H + +#include +#include + +class qa_32fc_deinterleave_64f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_64f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc new file mode 100644 index 000000000..850518524 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32fc_deinterleave_real_16s_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_real_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_deinterleave_real_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..68b80f27d --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H + +#include +#include + +class qa_32fc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc new file mode 100644 index 000000000..321deb184 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32fc_deinterleave_real_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_real_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_32f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_32f_aligned16_manual(output_sse, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..765450bb6 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc new file mode 100644 index 000000000..aedb2e387 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_32fc_deinterleave_real_64f_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_real_64f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + double output_generic[vlen] __attribute__ ((aligned (16))); + double output_sse2[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_deinterleave_real_64f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_64f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_64f_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h new file mode 100644 index 000000000..3e55fb812 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H + +#include +#include + +class qa_32fc_deinterleave_real_64f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_64f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_dot_prod_aligned16.cc b/volk/lib/qa_32fc_dot_prod_aligned16.cc new file mode 100644 index 000000000..bcf9ea954 --- /dev/null +++ b/volk/lib/qa_32fc_dot_prod_aligned16.cc @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include +#include + + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + + + +#if LV_HAVE_SSE3 +void qa_32fc_dot_prod_aligned16::t1() { + + const int vlen = 2046; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + 
std::complex* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result_sse3, 16, 8); + + + result_generic[0] = std::complex(0,0); + result_sse3[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + printf("32fc_dot_prod_aligned16\n"); + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse3"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + printf("generic: %f +i%f ... sse3: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0])); + + + assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} + +#else +void qa_32fc_dot_prod_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 +void qa_32fc_dot_prod_aligned16::t2() { + + const int vlen = 2046; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result_sse3, 16, 8); + + + result_generic[0] = std::complex(0,0); + result_sse3[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + printf("32fc_dot_prod_aligned16\n"); + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse_32"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_32_time: %f\n", total); + + printf("generic: %f +i%f ... sse_32: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0])); + + + assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} + +#else +void qa_32fc_dot_prod_aligned16::t2() { + printf("sse_32 not available... 
no test performed\n"); +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_64 + +void qa_32fc_dot_prod_aligned16::t3() { + + const int vlen = 2046; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result_sse3, 16, 8); + + + result_generic[0] = std::complex(0,0); + result_sse3[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + printf("32fc_dot_prod_aligned16\n"); + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse_64"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_64_time: %f\n", total); + + printf("generic: %f +i%f ... sse_64: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0])); + + + assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} + +#else +void qa_32fc_dot_prod_aligned16::t3() { + printf("sse_64 not available... 
no test performed\n"); +} + + + +#endif diff --git a/volk/lib/qa_32fc_dot_prod_aligned16.h b/volk/lib/qa_32fc_dot_prod_aligned16.h new file mode 100644 index 000000000..4b360db27 --- /dev/null +++ b/volk/lib/qa_32fc_dot_prod_aligned16.h @@ -0,0 +1,22 @@ +#ifndef INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H +#define INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H + +#include +#include + +class qa_32fc_dot_prod_aligned16 : public CppUnit::TestCase { + /* Every declared test must be registered below or the runner never calls it: t1 = "sse3", t2 = "sse_32", t3 = "sse_64", each compared against "generic". */ + CPPUNIT_TEST_SUITE (qa_32fc_dot_prod_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST (t2); + CPPUNIT_TEST (t3); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); + void t2 (); + void t3 (); +}; + + +#endif /* INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc new file mode 100644 index 000000000..4d83f1639 --- /dev/null +++ b/volk/lib/qa_32fc_index_max_aligned16.cc @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 1000000 +#define VEC_LEN 3096 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_index_max_aligned16::t1(){ + printf("sse3 not available... 
no test performed\n"); +} + +#else + + +void qa_32fc_index_max_aligned16::t1(){ + + const int vlen = VEC_LEN; + + volk_environment_init(); + int ret; + + unsigned int* target; + unsigned int* target_generic; + std::complex* src0 ; + + + unsigned int i_target; + target = &i_target; + unsigned int i_target_generic; + target_generic = &i_target_generic; + ret = posix_memalign((void**)&src0, 16, vlen << 3); + + random_floats((float*)src0, vlen * 2); + + printf("32fc_index_max_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 time: %f\n", total); + + + + + printf("generic: %u, sse3: %u\n", target_generic[0], target[0]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1); + + + + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_index_max_aligned16.h b/volk/lib/qa_32fc_index_max_aligned16.h new file mode 100644 index 000000000..0990bcb1f --- /dev/null +++ b/volk/lib/qa_32fc_index_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H +#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H + +#include +#include + +class qa_32fc_index_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc new file mode 100644 index 000000000..a4be1616b --- /dev/null +++ 
b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_magnitude_16s_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#else + +void qa_32fc_magnitude_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse[vlen] __attribute__ ((aligned (16))); + int16_t output_sse3[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_magnitude_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_16s_aligned16_manual(output_sse3, input0, 32768.0, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1); + } +} + +#endif diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.h b/volk/lib/qa_32fc_magnitude_16s_aligned16.h new file mode 100644 index 000000000..ffdf1dd9e --- /dev/null +++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H +#define INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H + +#include +#include + +class qa_32fc_magnitude_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_magnitude_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc new file mode 100644 index 000000000..d69ada408 --- /dev/null +++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_magnitude_32f_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#else + +void qa_32fc_magnitude_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse3[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_magnitude_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_32f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_32f_aligned16_manual(output_sse, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_32f_aligned16_manual(output_sse3, input0, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.h b/volk/lib/qa_32fc_magnitude_32f_aligned16.h new file mode 100644 index 000000000..a2881308c --- /dev/null +++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_magnitude_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_magnitude_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_multiply_aligned16.cc b/volk/lib/qa_32fc_multiply_aligned16.cc new file mode 100644 index 000000000..e1f7eab3d --- /dev/null +++ b/volk/lib/qa_32fc_multiply_aligned16.cc @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include +#include + + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-3) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifdef LV_HAVE_SSE3 +void qa_32fc_multiply_aligned16::t1() { + + const int vlen = 2046; + const int ITERS = 100000; + + int i; + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* 
input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, vlen*2*sizeof(float)); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + printf("32fc_multiply_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_multiply_aligned16_manual(result_generic, input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} +#else +void qa_32fc_multiply_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#endif /* LV_HAVE_SSE3 */ diff --git a/volk/lib/qa_32fc_multiply_aligned16.h b/volk/lib/qa_32fc_multiply_aligned16.h new file mode 100644 index 000000000..c8abaa8fe --- /dev/null +++ b/volk/lib/qa_32fc_multiply_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H +#define INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H + +#include +#include + +class qa_32fc_multiply_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_multiply_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc new file mode 100644 index 000000000..83cdf4b15 --- /dev/null +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +//test for sse3 + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_power_spectral_density_32f_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#else + +void qa_32fc_power_spectral_density_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse3[vlen] __attribute__ ((aligned (16))); + + const float scalar = vlen; + const float rbw = 1.7; + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_power_spectral_density_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4)); + } +} + +#endif diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h new file mode 100644 index 000000000..26f430bec --- /dev/null +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc new file mode 100644 index 000000000..4d1359068 --- /dev/null +++ b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +//test for sse3 + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_power_spectrum_32f_aligned16::t1() { + printf("sse3 not available... 
no test performed\n"); +} + +#else + +void qa_32fc_power_spectrum_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse3[vlen] __attribute__ ((aligned (16))); + + const float scalar = vlen; + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + + printf("32fc_power_spectrum_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectrum_32f_aligned16_manual(output_generic, input0, scalar, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectrum_32f_aligned16_manual(output_sse3, input0, scalar, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse33... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4)); + } +} + +#endif diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h new file mode 100644 index 000000000..d991223f3 --- /dev/null +++ b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_power_spectrum_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_power_spectrum_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_square_dist_aligned16.cc b/volk/lib/qa_32fc_square_dist_aligned16.cc new file mode 100644 index 000000000..d9ead8495 --- /dev/null +++ b/volk/lib/qa_32fc_square_dist_aligned16.cc @@ -0,0 +1,91 @@ +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 10000000 +#define VEC_LEN 64 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_square_dist_aligned16::t1(){ + printf("sse3 not available... 
no test performed\n"); +} + +#else + + +void qa_32fc_square_dist_aligned16::t1(){ + int i = 0; + + const int vlen = VEC_LEN; + volk_environment_init(); + int ret; + + float* target; + float* target_generic; + std::complex* src0 ; + std::complex* points; + + ret = posix_memalign((void**)&points, 16, vlen << 3); + ret = posix_memalign((void**)&target, 16, vlen << 2); + ret = posix_memalign((void**)&target_generic, 16, vlen << 2); + ret = posix_memalign((void**)&src0, 16, 8); + + random_floats((float*)points, vlen * 2); + random_floats((float*)src0, 2); + + printf("32fc_square_dist_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_square_dist_aligned16_manual(target_generic, src0, points, vlen << 3, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_square_dist_aligned16_manual(target, src0, points, vlen << 3, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 time: %f\n", total); + + + + for(; i < vlen; ++i) { + //printf("generic: %f, sse3: %f\n", target_generic[i], target[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[i], target[i], fabs(target_generic[i]) * ERR_DELTA); + } + + free(target); + free(target_generic); + free(points); + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_square_dist_aligned16.h b/volk/lib/qa_32fc_square_dist_aligned16.h new file mode 100644 index 000000000..9d365d8b0 --- /dev/null +++ b/volk/lib/qa_32fc_square_dist_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H +#define INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H + +#include +#include + +class qa_32fc_square_dist_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_square_dist_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + 
private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc new file mode 100644 index 000000000..f923d1d5c --- /dev/null +++ b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc @@ -0,0 +1,96 @@ +#include +#include +#include +#include +#include +#include + +#define ERR_DELTA .0001 +#define NUM_ITERS 10000000 +#define VEC_LEN 64 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_square_dist_scalar_mult_aligned16::t1(){ + printf("sse3 not available... no test performed\n"); +} + +#else + + +void qa_32fc_square_dist_scalar_mult_aligned16::t1(){ + int i = 0; + + const int vlen = VEC_LEN; + + volk_environment_init(); + int ret; + + float* target; + float* target_generic; + std::complex* src0 ; + std::complex* points; + float scalar; + + ret = posix_memalign((void**)&points, 16, vlen << 3); + ret = posix_memalign((void**)&target, 16, vlen << 2); + ret = posix_memalign((void**)&target_generic, 16, vlen << 2); + ret = posix_memalign((void**)&src0, 16, 8); + + random_floats((float*)points, vlen * 2); + random_floats((float*)src0, 2); + random_floats(&scalar, 1); + + printf("32fc_square_dist_scalar_mult_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_square_dist_scalar_mult_aligned16_manual(target_generic, src0, points, scalar, vlen << 3, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_square_dist_scalar_mult_aligned16_manual(target, src0, points, scalar, vlen << 
3, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 time: %f\n", total); + + + + for(i = 0; i < vlen; ++i) { + printf("generic: %f, sse3: %f\n", target_generic[i], target[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target[i], target_generic[i], fabs(target_generic[i]) * ERR_DELTA); /* tolerance is relative to the element under test, not to a fixed element */ + } + + free(target); + free(target_generic); + free(points); + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h new file mode 100644 index 000000000..ac4e3c45b --- /dev/null +++ b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H +#define INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H + +#include +#include + +class qa_32fc_square_dist_scalar_mult_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_square_dist_scalar_mult_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H */ diff --git a/volk/lib/qa_32s_and_aligned16.cc b/volk/lib/qa_32s_and_aligned16.cc new file mode 100644 index 000000000..72d05cf6f --- /dev/null +++ b/volk/lib/qa_32s_and_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32s_and_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32s_and_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int32_t input0[vlen] __attribute__ ((aligned (16))); + int32_t input1[vlen] __attribute__ ((aligned (16))); + + int32_t output0[vlen] __attribute__ ((aligned (16))); + int32_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int32_t) (rand() - (RAND_MAX/2))); + input1[i] = ((int32_t) (rand() - (RAND_MAX/2))); + } + printf("32s_and_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_and_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_and_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32s_and_aligned16.h b/volk/lib/qa_32s_and_aligned16.h new file mode 100644 index 000000000..dfcb47c63 --- /dev/null +++ b/volk/lib/qa_32s_and_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32S_AND_ALIGNED16_H +#define INCLUDED_QA_32S_AND_ALIGNED16_H + +#include +#include + +class qa_32s_and_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32s_and_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32S_AND_ALIGNED16_H */ diff --git a/volk/lib/qa_32s_convert_32f_aligned16.cc b/volk/lib/qa_32s_convert_32f_aligned16.cc new file mode 100644 index 000000000..eab3fe016 --- /dev/null +++ b/volk/lib/qa_32s_convert_32f_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32s_convert_32f_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32s_convert_32f_aligned16::t1() { + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + + int32_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int32_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); + } + printf("32s_convert_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_convert_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_convert_32f_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_32s_convert_32f_aligned16.h b/volk/lib/qa_32s_convert_32f_aligned16.h new file mode 100644 index 000000000..efd2a2eea --- /dev/null +++ b/volk/lib/qa_32s_convert_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H +#define INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H + +#include +#include + +class qa_32s_convert_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32s_convert_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.cc b/volk/lib/qa_32s_convert_32f_unaligned16.cc new file mode 100644 index 000000000..0e504cfa1 --- /dev/null +++ b/volk/lib/qa_32s_convert_32f_unaligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32s_convert_32f_unaligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32s_convert_32f_unaligned16::t1() { + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + + int32_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int32_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); + } + printf("32s_convert_32f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_convert_32f_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_convert_32f_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.h b/volk/lib/qa_32s_convert_32f_unaligned16.h new file mode 100644 index 000000000..5006f5fd8 --- /dev/null +++ b/volk/lib/qa_32s_convert_32f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H +#define INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H + +#include +#include + +class qa_32s_convert_32f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32s_convert_32f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_32s_or_aligned16.cc b/volk/lib/qa_32s_or_aligned16.cc new file mode 100644 index 000000000..e09dfb91c --- /dev/null +++ b/volk/lib/qa_32s_or_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32s_or_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32s_or_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int32_t input0[vlen] __attribute__ ((aligned (16))); + int32_t input1[vlen] __attribute__ ((aligned (16))); + + int32_t output0[vlen] __attribute__ ((aligned (16))); + int32_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int32_t) (rand() - (RAND_MAX/2))); + input1[i] = ((int32_t) (rand() - (RAND_MAX/2))); + } + printf("32s_or_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_or_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_or_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32s_or_aligned16.h b/volk/lib/qa_32s_or_aligned16.h new file mode 100644 index 000000000..9e949eb52 --- /dev/null +++ b/volk/lib/qa_32s_or_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32S_OR_ALIGNED16_H +#define INCLUDED_QA_32S_OR_ALIGNED16_H + +#include +#include + +class qa_32s_or_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32s_or_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32S_OR_ALIGNED16_H */ diff --git a/volk/lib/qa_32u_byteswap_aligned16.cc b/volk/lib/qa_32u_byteswap_aligned16.cc new file mode 100644 index 000000000..8b1023876 --- /dev/null +++ b/volk/lib/qa_32u_byteswap_aligned16.cc @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_32u_byteswap_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_32u_byteswap_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100001; + + uint32_t output0[vlen] __attribute__ ((aligned (16))); + uint32_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + output0[i] = (uint32_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); + } + memcpy(output01, output0, vlen*sizeof(uint32_t)); + printf("32u_byteswap_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32u_byteswap_aligned16_manual(output0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32u_byteswap_aligned16_manual(output01, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32u_byteswap_aligned16.h b/volk/lib/qa_32u_byteswap_aligned16.h new file mode 100644 index 000000000..47bad4c3d --- /dev/null +++ b/volk/lib/qa_32u_byteswap_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H +#define INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H + +#include +#include + +class qa_32u_byteswap_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32u_byteswap_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H */ diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc new file mode 100644 index 000000000..49fcddeb2 --- /dev/null +++ b/volk/lib/qa_32u_popcnt_aligned16.cc @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_2 + +void qa_32u_popcnt_aligned16::t1() { + printf("sse4.2 not available... 
no test performed\n"); +} + +#else + +void qa_32u_popcnt_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + + const int ITERS = 10000000; + uint32_t input0 __attribute__ ((aligned (16))); + + uint32_t output0 __attribute__ ((aligned (16))); + uint32_t output01 __attribute__ ((aligned (16))); + + input0 = ((uint32_t) (rand() - (RAND_MAX/2))); + output0 = 0; + output01 = 0; + + printf("32u_popcnt_aligned\n"); + + start = clock(); + uint32_t ret = 0; + for(int count = 0; count < ITERS; ++count) { + volk_32u_popcnt_aligned16_manual(&ret, input0, "generic"); + output0 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + ret = 0; + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32u_popcnt_aligned16(&ret, input0); + output01 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.2_time: %f\n", total); + + + CPPUNIT_ASSERT_EQUAL(output0, output01); +} + +#endif diff --git a/volk/lib/qa_32u_popcnt_aligned16.h b/volk/lib/qa_32u_popcnt_aligned16.h new file mode 100644 index 000000000..fa1dc1041 --- /dev/null +++ b/volk/lib/qa_32u_popcnt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32U_POPCNT_ALIGNED16_H +#define INCLUDED_QA_32U_POPCNT_ALIGNED16_H + +#include +#include + +class qa_32u_popcnt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32u_popcnt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32U_POPCNT_ALIGNED16_H */ diff --git a/volk/lib/qa_64f_convert_32f_aligned16.cc b/volk/lib/qa_64f_convert_32f_aligned16.cc new file mode 100644 index 000000000..0eaebf00a --- /dev/null +++ b/volk/lib/qa_64f_convert_32f_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_64f_convert_32f_aligned16::t1() { 
+ printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_64f_convert_32f_aligned16::t1() { + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + + double input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("64f_convert_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_convert_32f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_convert_32f_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_64f_convert_32f_aligned16.h b/volk/lib/qa_64f_convert_32f_aligned16.h new file mode 100644 index 000000000..95d79f73d --- /dev/null +++ b/volk/lib/qa_64f_convert_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H +#define INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H + +#include +#include + +class qa_64f_convert_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64f_convert_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.cc b/volk/lib/qa_64f_convert_32f_unaligned16.cc new file mode 100644 index 000000000..dcf94bd27 --- /dev/null +++ b/volk/lib/qa_64f_convert_32f_unaligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_64f_convert_32f_unaligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_64f_convert_32f_unaligned16::t1() { + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + + double input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("64f_convert_32f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_convert_32f_unaligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_convert_32f_unaligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.h b/volk/lib/qa_64f_convert_32f_unaligned16.h new file mode 100644 index 000000000..430327e81 --- /dev/null +++ b/volk/lib/qa_64f_convert_32f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H +#define INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H + +#include +#include + +class qa_64f_convert_32f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64f_convert_32f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_64f_max_aligned16.cc b/volk/lib/qa_64f_max_aligned16.cc new file mode 100644 index 000000000..41ab078b0 --- /dev/null +++ b/volk/lib/qa_64f_max_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_64f_max_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_64f_max_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + double input0[vlen] __attribute__ ((aligned (16))); + double input1[vlen] __attribute__ ((aligned (16))); + + double output0[vlen] __attribute__ ((aligned (16))); + double output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("64f_max_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_max_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_max_aligned16_manual(output01, input0, input1, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_64f_max_aligned16.h b/volk/lib/qa_64f_max_aligned16.h new file mode 100644 index 000000000..7cbd4d4c1 --- /dev/null +++ b/volk/lib/qa_64f_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64F_MAX_ALIGNED16_H +#define INCLUDED_QA_64F_MAX_ALIGNED16_H + +#include +#include + +class qa_64f_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64f_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64F_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_64f_min_aligned16.cc b/volk/lib/qa_64f_min_aligned16.cc new file mode 100644 index 000000000..b4664d065 --- /dev/null +++ b/volk/lib/qa_64f_min_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_64f_min_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_64f_min_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + double input0[vlen] __attribute__ ((aligned (16))); + double input1[vlen] __attribute__ ((aligned (16))); + + double output0[vlen] __attribute__ ((aligned (16))); + double output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("64f_min_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_min_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_min_aligned16_manual(output01, input0, input1, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_64f_min_aligned16.h b/volk/lib/qa_64f_min_aligned16.h new file mode 100644 index 000000000..a0e95395f --- /dev/null +++ b/volk/lib/qa_64f_min_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64F_MIN_ALIGNED16_H +#define INCLUDED_QA_64F_MIN_ALIGNED16_H + +#include +#include + +class qa_64f_min_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64f_min_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64F_MIN_ALIGNED16_H */ diff --git a/volk/lib/qa_64u_byteswap_aligned16.cc b/volk/lib/qa_64u_byteswap_aligned16.cc new file mode 100644 index 000000000..4f5d4d02b --- /dev/null +++ b/volk/lib/qa_64u_byteswap_aligned16.cc @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_64u_byteswap_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_64u_byteswap_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100001; + + uint64_t output0[vlen] __attribute__ ((aligned (16))); + uint64_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + output0[i] = (uint64_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); + } + memcpy(output01, output0, vlen*sizeof(uint64_t)); + printf("64u_byteswap_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64u_byteswap_aligned16_manual(output0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64u_byteswap_aligned16_manual(output01, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_64u_byteswap_aligned16.h b/volk/lib/qa_64u_byteswap_aligned16.h new file mode 100644 index 000000000..a4fa0c983 --- /dev/null +++ b/volk/lib/qa_64u_byteswap_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H +#define INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H + +#include +#include + +class qa_64u_byteswap_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64u_byteswap_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H */ diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc new file mode 100644 index 000000000..bce9ff6c2 --- /dev/null +++ b/volk/lib/qa_64u_popcnt_aligned16.cc @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_2 + +void qa_64u_popcnt_aligned16::t1() { + printf("sse4.2 not available... 
no test performed\n"); +} + +#else + +void qa_64u_popcnt_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + + const int ITERS = 10000000; + uint64_t input0 __attribute__ ((aligned (16))); + + uint64_t output0 __attribute__ ((aligned (16))); + uint64_t output01 __attribute__ ((aligned (16))); + + input0 = ((uint64_t) (rand() - (RAND_MAX/2))); + output0 = 0; + output01 = 0; + + printf("64u_popcnt_aligned\n"); + + start = clock(); + uint64_t ret = 0; + for(int count = 0; count < ITERS; ++count) { + volk_64u_popcnt_aligned16_manual(&ret, input0, "generic"); + output0 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + ret = 0; + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_64u_popcnt_aligned16(&ret, input0); + output01 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.2_time: %f\n", total); + + + CPPUNIT_ASSERT_EQUAL(output0, output01); +} + +#endif diff --git a/volk/lib/qa_64u_popcnt_aligned16.h b/volk/lib/qa_64u_popcnt_aligned16.h new file mode 100644 index 000000000..217822d6e --- /dev/null +++ b/volk/lib/qa_64u_popcnt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64U_POPCNT_ALIGNED16_H +#define INCLUDED_QA_64U_POPCNT_ALIGNED16_H + +#include +#include + +class qa_64u_popcnt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64u_popcnt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64U_POPCNT_ALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_16s_aligned16.cc b/volk/lib/qa_8s_convert_16s_aligned16.cc new file mode 100644 index 000000000..35f08fb81 --- /dev/null +++ b/volk/lib/qa_8s_convert_16s_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include + +//test for sse4_1 + +#ifndef LV_HAVE_SSE4_1 + +void 
qa_8s_convert_16s_aligned16::t1() { + printf("sse4.1 not available... no test performed\n"); +} + +#else + +void qa_8s_convert_16s_aligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int8_t input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 128.0)); + } + printf("8s_convert_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_16s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8s_convert_16s_aligned16(output_sse4_1, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8s_convert_16s_aligned16.h b/volk/lib/qa_8s_convert_16s_aligned16.h new file mode 100644 index 000000000..38739fc96 --- /dev/null +++ b/volk/lib/qa_8s_convert_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H +#define INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H + +#include +#include + +class qa_8s_convert_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8s_convert_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.cc b/volk/lib/qa_8s_convert_16s_unaligned16.cc new file mode 100644 index 000000000..bb326f818 --- /dev/null +++ b/volk/lib/qa_8s_convert_16s_unaligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include + +//test for sse4_1 + +#ifndef LV_HAVE_SSE4_1 + +void qa_8s_convert_16s_unaligned16::t1() { + printf("sse4.1 not available... 
no test performed\n"); +} + +#else + +void qa_8s_convert_16s_unaligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int8_t input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 128.0)); + } + printf("8s_convert_16s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_16s_unaligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8s_convert_16s_unaligned16(output_sse4_1, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.h b/volk/lib/qa_8s_convert_16s_unaligned16.h new file mode 100644 index 000000000..d39fffc35 --- /dev/null +++ b/volk/lib/qa_8s_convert_16s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H +#define INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H + +#include +#include + +class qa_8s_convert_16s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8s_convert_16s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_32f_aligned16.cc b/volk/lib/qa_8s_convert_32f_aligned16.cc new file mode 100644 index 000000000..522da0b9d --- /dev/null +++ b/volk/lib/qa_8s_convert_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include + +//test for sse4.1 + +#ifndef LV_HAVE_SSE4_1 + +void qa_8s_convert_32f_aligned16::t1() { + printf("sse4_1 not available... 
no test performed\n"); +} + +#else + +void qa_8s_convert_32f_aligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int8_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 128.0)); + } + printf("8s_convert_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8s_convert_32f_aligned16(output_sse4_1, input0, 128.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8s_convert_32f_aligned16.h b/volk/lib/qa_8s_convert_32f_aligned16.h new file mode 100644 index 000000000..7f8401d42 --- /dev/null +++ b/volk/lib/qa_8s_convert_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H +#define INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H + +#include +#include + +class qa_8s_convert_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8s_convert_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.cc b/volk/lib/qa_8s_convert_32f_unaligned16.cc new file mode 100644 index 000000000..ea1fb7c74 --- /dev/null +++ b/volk/lib/qa_8s_convert_32f_unaligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include + +//test for sse4.1 + +#ifndef LV_HAVE_SSE4_1 + +void qa_8s_convert_32f_unaligned16::t1() { + printf("sse4_1 not available... 
no test performed\n"); +} + +#else + +void qa_8s_convert_32f_unaligned16::t1() { /* Times the generic and get_volk_runtime() 8s->32f converters on input starting one byte past a 16-byte-aligned buffer, then requires identical outputs. */ + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int8_t input0[vlen+1] __attribute__ ((aligned (16))); + + float output_generic[vlen+1] __attribute__ ((aligned (16))); + float output_sse4_1[vlen+1] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen+1; ++i) { /* initialise all vlen+1 bytes: the kernels read input0[1] .. input0[vlen] */ + input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0)); + } + printf("8s_convert_32f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_32f_unaligned16_manual(output_generic, &input0[1], 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8s_convert_32f_unaligned16(output_sse4_1, &input0[1], 128.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%e...%e\n", output_generic[i], output_sse4_1[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.h b/volk/lib/qa_8s_convert_32f_unaligned16.h new file mode 100644 index 000000000..aad2f8c22 --- /dev/null +++ b/volk/lib/qa_8s_convert_32f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H +#define INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H + +#include +#include + +class qa_8s_convert_32f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8s_convert_32f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc new file mode 100644 index 000000000..823e7fe2e --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc @@ -0,0 +1,67 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +void qa_8sc_deinterleave_16s_aligned16::t1() { + printf("sse4_1 not available... 
no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_16s_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_generic1[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_11[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_deinterleave_16s_aligned16(output_sse4_1, output_sse4_11, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_sse4_11[i]); + } +} + +#endif diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.h b/volk/lib/qa_8sc_deinterleave_16s_aligned16.h new file mode 100644 index 000000000..9c99fed70 --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H + +#include +#include + +class qa_8sc_deinterleave_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc new file mode 100644 index 000000000..fb580516c --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc @@ -0,0 +1,134 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +#ifndef LV_HAVE_SSE + +void qa_8sc_deinterleave_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_32f_aligned16::t1() { /* SSE-only build: times the "generic" and "sse" implementations selected by name and compares both deinterleaved outputs within a relative tolerance. */ + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_generic1[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse1[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, 128.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], fabs(output_generic1[i])*1e-4); /* tolerance derived from the value actually being compared */ + } +} + +#endif /* LV_HAVE_SSE */ + +#else + +void qa_8sc_deinterleave_32f_aligned16::t1() { /* SSE4.1 build: times the "generic", "sse" and get_volk_runtime() implementations and compares both deinterleaved outputs of each against the generic result. */ + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_generic1[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse1[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + float output_sse14_1[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, 128.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + 
start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_deinterleave_32f_aligned16(output_sse4_1, output_sse14_1, input0, 128.0, vlen); + } + end = clock(); + total = 
(double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1_time: %f\n", total); + + for(int i = 0; i < vlen; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("%d generic... %e %e, sse... %e %e sse4.1... %e %e\n", i, output_generic[i], output_generic1[i], output_sse[i], output_sse1[i], output_sse4_1[i], output_sse14_1[i]); + } + + for(int i = 0; i < vlen; ++i) { /* tolerance: relative 1e-4 of the compared value's magnitude, floored at an absolute 1e-4 */ + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i],std::max(fabs(output_generic[i])*1e-4, 1e-4)); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], std::max(fabs(output_generic1[i])*1e-4, 1e-4)); + + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], std::max(fabs(output_generic[i])*1e-4, 1e-4)); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse14_1[i], std::max(fabs(output_generic1[i])*1e-4, 1e-4)); + } +} + + +#endif /* LV_HAVE_SSE4_1 */ diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.h b/volk/lib/qa_8sc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..63b5fdadb --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H + +#include +#include + +class qa_8sc_deinterleave_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc new file mode 100644 index 000000000..1cc844b52 --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include + +//test for 
sse + +#ifndef LV_HAVE_SSE4_1 + +void qa_8sc_deinterleave_real_16s_aligned16::t1() { + printf("sse4_1 not 
available... no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_real_16s_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_real_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_16s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_deinterleave_real_16s_aligned16(output_sse4_1, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..02050926f --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H + +#include +#include + +class qa_8sc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc new file mode 100644 index 000000000..10e537cde --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc @@ -0,0 +1,138 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +#ifndef LV_HAVE_SSE + +void qa_8sc_deinterleave_real_32f_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_real_32f_aligned16::t1() { /* SSE-only build: times the "generic" and "sse" implementations selected by name (same 32768.0 scalar) and compares their outputs within a relative tolerance. */ + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + } +} + +#endif /* LV_HAVE_SSE */ + +#else + +void qa_8sc_deinterleave_real_32f_aligned16::t1() { /* SSE4.1 build: times the "generic", "sse" and get_volk_runtime() implementations on posix_memalign'd buffers; all three runs must use the same scalar so their outputs are comparable. */ + + + volk_runtime_init(); + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex<int8_t> *input0; + + float* output_generic; + float* output_sse; + float* output_sse4_1; + + ret = posix_memalign((void**)&input0, 16, 2*vlen * sizeof(int8_t)); + ret = posix_memalign((void**)&output_generic, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&output_sse, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&output_sse4_1, 16, vlen * sizeof(float)); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0)); /* scale by 128 before narrowing to char; narrowing first truncates almost every sample to 0 */ + } + + printf("8sc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 128.0, vlen, "sse"); /* same scalar as the generic run, otherwise the outputs cannot match */ + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", 
total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_deinterleave_real_32f_aligned16(output_sse4_1, input0, 128.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", 
input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4); + } + + free(input0); + free(output_generic); + free(output_sse); + free(output_sse4_1); +} + +#endif /* LV_HAVE_SSE4_1 */ diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..93338e488 --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H + +#include +#include + +class qa_8sc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc new file mode 100644 index 000000000..d84df8119 --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSSE3 + +void qa_8sc_deinterleave_real_8s_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_real_8s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_ssse3[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_real_8s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_8s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); + } +} + +#endif diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h new file mode 100644 index 000000000..92fc0dd4a --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H + +#include +#include + +class qa_8sc_deinterleave_real_8s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_8s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc new file mode 100644 index 000000000..d64eac8ce --- /dev/null +++ b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +#ifndef LV_HAVE_SSE4_1 + +void qa_8sc_multiply_conjugate_16sc_aligned16::t1() { + printf("sse4.1 not available... 
no test performed\n"); +} + +#else + +void qa_8sc_multiply_conjugate_16sc_aligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 2046; + const int ITERS = 100000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result_sse4_1; + int i; + int8_t* inputInt8_T; + int8_t* tapsInt8_T; + + ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(int8_t)); + ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(int8_t)); + ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(int16_t)); + ret = posix_memalign((void**)&result_sse4_1, 16, vlen*2*sizeof(int16_t)); + + inputInt8_T = (int8_t*)input; + tapsInt8_T = (int8_t*)taps; + for(int i = 0; i < vlen*2; ++i) { + inputInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + tapsInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + + printf("8sc_multiply_conjugate_16sc_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_multiply_conjugate_16sc_aligned16_manual((std::complex*)result_generic, (std::complex*)input, (std::complex*)taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_multiply_conjugate_16sc_aligned16((std::complex*)result_sse4_1, (std::complex*)input, (std::complex*)taps, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + //printf("%d %d+%di %d+%di -> %d+%di %d+%di\n", i, std::real(input[i]), std::imag(input[i]), std::real(taps[i]), std::imag(taps[i]), std::real(result_generic[i]), std::imag(result_generic[i]), std::real(result_sse4_1[i]), 
std::imag(result_sse4_1[i])); + + assertcomplexEqual(result_generic[i], result_sse4_1[i], ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse4_1); + +} + +#endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h new file mode 100644 index 000000000..0e78a5eca --- /dev/null +++ b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H +#define INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H + +#include +#include + +class qa_8sc_multiply_conjugate_16sc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_multiply_conjugate_16sc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc new file mode 100644 index 000000000..c27f0e0ca --- /dev/null +++ b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +#ifndef LV_HAVE_SSE4_1 + +void qa_8sc_multiply_conjugate_32fc_aligned16::t1() { + printf("sse4.1 not available... 
no test performed\n"); +} + +#else + +void qa_8sc_multiply_conjugate_32fc_aligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 2046; + const int ITERS = 100000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result_sse4_1; + int i; + int8_t* inputInt8_T; + int8_t* tapsInt8_T; + + ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(int8_t)); + ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(int8_t)); + ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&result_sse4_1, 16, vlen*2*sizeof(float)); + + + inputInt8_T = (int8_t*)input; + tapsInt8_T = (int8_t*)taps; + for(int i = 0; i < vlen*2; ++i) { + inputInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + tapsInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + + printf("8sc_multiply_conjugate_32fc_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_multiply_conjugate_32fc_aligned16_manual(result_generic, (const std::complex*)input, (const std::complex*)taps, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_multiply_conjugate_32fc_aligned16(result_sse4_1, (const std::complex*)input, (const std::complex*)taps, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + //printf("%d %d+%di %d+%di -> %e+%ei %e+%ei\n", i, std::real(input[i]), std::imag(input[i]), std::real(taps[i]), std::imag(taps[i]), std::real(result_generic[i]), std::imag(result_generic[i]), std::real(result_sse4_1[i]), 
std::imag(result_sse4_1[i])); + assertcomplexEqual(result_generic[i], result_sse4_1[i], ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse4_1); + +} + +#endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h new file mode 100644 index 000000000..eb9ae309c --- /dev/null +++ b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H +#define INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H + +#include +#include + +class qa_8sc_multiply_conjugate_32fc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_multiply_conjugate_32fc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H */ diff --git a/volk/lib/qa_volk.cc b/volk/lib/qa_volk.cc new file mode 100644 index 000000000..c3c27b69b --- /dev/null +++ b/volk/lib/qa_volk.cc @@ -0,0 +1,211 @@ +/* + * Copyright 2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. 
+ */ + +/* + * This class gathers together all the test cases for the example + * directory into a single test suite. As you create new test cases, + * add them here. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +CppUnit::TestSuite * +qa_volk::suite() +{ + CppUnit::TestSuite *s = new CppUnit::TestSuite("volk"); + + s->addTest(qa_16s_quad_max_star_aligned16::suite()); + s->addTest(qa_32fc_dot_prod_aligned16::suite()); + s->addTest(qa_32fc_square_dist_scalar_mult_aligned16::suite()); + s->addTest(qa_32fc_square_dist_aligned16::suite()); + s->addTest(qa_32f_sum_of_poly_aligned16::suite()); + s->addTest(qa_32fc_index_max_aligned16::suite()); + s->addTest(qa_32f_index_max_aligned16::suite()); + s->addTest(qa_32fc_conjugate_dot_prod_aligned16::suite()); + s->addTest(qa_16s_permute_and_scalar_add_aligned16::suite()); + s->addTest(qa_16s_branch_4_state_8_aligned16::suite()); + s->addTest(qa_16s_max_star_horizontal_aligned16::suite()); + s->addTest(qa_16s_max_star_aligned16::suite()); + s->addTest(qa_16s_add_quad_aligned16::suite()); + s->addTest(qa_32f_add_aligned16::suite()); + s->addTest(qa_32f_subtract_aligned16::suite()); + 
s->addTest(qa_32f_max_aligned16::suite()); + s->addTest(qa_32f_min_aligned16::suite()); + s->addTest(qa_64f_max_aligned16::suite()); + s->addTest(qa_64f_min_aligned16::suite()); + s->addTest(qa_32s_and_aligned16::suite()); + s->addTest(qa_32s_or_aligned16::suite()); + s->addTest(qa_32f_dot_prod_aligned16::suite()); + s->addTest(qa_32f_dot_prod_unaligned16::suite()); + s->addTest(qa_32f_fm_detect_aligned16::suite()); + s->addTest(qa_32fc_32f_multiply_aligned16::suite()); + s->addTest(qa_32fc_multiply_aligned16::suite()); + s->addTest(qa_32f_divide_aligned16::suite()); + s->addTest(qa_32f_multiply_aligned16::suite()); + s->addTest(qa_32f_sqrt_aligned16::suite()); + s->addTest(qa_8sc_multiply_conjugate_16sc_aligned16::suite()); + s->addTest(qa_8sc_multiply_conjugate_32fc_aligned16::suite()); + s->addTest(qa_32u_popcnt_aligned16::suite()); + s->addTest(qa_64u_popcnt_aligned16::suite()); + s->addTest(qa_16u_byteswap_aligned16::suite()); + s->addTest(qa_32u_byteswap_aligned16::suite()); + s->addTest(qa_64u_byteswap_aligned16::suite()); + s->addTest(qa_32f_normalize_aligned16::suite()); + s->addTest(qa_16sc_deinterleave_16s_aligned16::suite()); + s->addTest(qa_16sc_deinterleave_32f_aligned16::suite()); + s->addTest(qa_16sc_deinterleave_real_16s_aligned16::suite()); + s->addTest(qa_16sc_deinterleave_real_32f_aligned16::suite()); + s->addTest(qa_16sc_deinterleave_real_8s_aligned16::suite()); + s->addTest(qa_16sc_magnitude_16s_aligned16::suite()); + s->addTest(qa_16sc_magnitude_32f_aligned16::suite()); + s->addTest(qa_32fc_deinterleave_32f_aligned16::suite()); + s->addTest(qa_32fc_deinterleave_64f_aligned16::suite()); + s->addTest(qa_32fc_deinterleave_real_16s_aligned16::suite()); + s->addTest(qa_32fc_deinterleave_real_32f_aligned16::suite()); + s->addTest(qa_32fc_deinterleave_real_64f_aligned16::suite()); + s->addTest(qa_32fc_magnitude_16s_aligned16::suite()); + s->addTest(qa_32fc_magnitude_32f_aligned16::suite()); + s->addTest(qa_32f_interleave_16sc_aligned16::suite()); + 
s->addTest(qa_32f_interleave_32fc_aligned16::suite()); + s->addTest(qa_8sc_deinterleave_16s_aligned16::suite()); + s->addTest(qa_8sc_deinterleave_32f_aligned16::suite()); + s->addTest(qa_8sc_deinterleave_real_16s_aligned16::suite()); + s->addTest(qa_8sc_deinterleave_real_32f_aligned16::suite()); + s->addTest(qa_8sc_deinterleave_real_8s_aligned16::suite()); + s->addTest(qa_16s_convert_32f_aligned16::suite()); + s->addTest(qa_16s_convert_32f_unaligned16::suite()); + s->addTest(qa_16s_convert_8s_aligned16::suite()); + s->addTest(qa_16s_convert_8s_unaligned16::suite()); + s->addTest(qa_32f_convert_16s_aligned16::suite()); + s->addTest(qa_32f_convert_16s_unaligned16::suite()); + s->addTest(qa_32f_convert_32s_aligned16::suite()); + s->addTest(qa_32f_convert_32s_unaligned16::suite()); + s->addTest(qa_32f_convert_64f_aligned16::suite()); + s->addTest(qa_32f_convert_64f_unaligned16::suite()); + s->addTest(qa_32f_convert_8s_aligned16::suite()); + s->addTest(qa_32f_convert_8s_unaligned16::suite()); + s->addTest(qa_32s_convert_32f_aligned16::suite()); + s->addTest(qa_32s_convert_32f_unaligned16::suite()); + s->addTest(qa_64f_convert_32f_aligned16::suite()); + s->addTest(qa_64f_convert_32f_unaligned16::suite()); + s->addTest(qa_8s_convert_16s_aligned16::suite()); + s->addTest(qa_8s_convert_16s_unaligned16::suite()); + s->addTest(qa_8s_convert_32f_aligned16::suite()); + s->addTest(qa_8s_convert_32f_unaligned16::suite()); + s->addTest(qa_32fc_32f_power_32fc_aligned16::suite()); + s->addTest(qa_32f_power_aligned16::suite()); + s->addTest(qa_32fc_atan2_32f_aligned16::suite()); + s->addTest(qa_32fc_power_spectral_density_32f_aligned16::suite()); + s->addTest(qa_32fc_power_spectrum_32f_aligned16::suite()); + s->addTest(qa_32f_calc_spectral_noise_floor_aligned16::suite()); + s->addTest(qa_32f_accumulator_aligned16::suite()); + s->addTest(qa_32f_stddev_aligned16::suite()); + s->addTest(qa_32f_stddev_and_mean_aligned16::suite()); + + return s; +} diff --git a/volk/lib/qa_volk.h 
b/volk/lib/qa_volk.h new file mode 100644 index 000000000..43fa7faba --- /dev/null +++ b/volk/lib/qa_volk.h @@ -0,0 +1,36 @@ +/* -*- c++ -*- */ +/* + * Copyright 2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_QA_VOLK_H +#define INCLUDED_QA_VOLK_H + +#include + +//! collect all the tests for the volk library + +class qa_volk { + public: + //! return suite of tests for all of the volk library + static CppUnit::TestSuite *suite (); +}; + +#endif /* INCLUDED_QA_VOLK_H */ diff --git a/volk/lib/test_all.cc b/volk/lib/test_all.cc new file mode 100644 index 000000000..50ac08eab --- /dev/null +++ b/volk/lib/test_all.cc @@ -0,0 +1,82 @@ +/* -*- c++ -*- */ +/* + * Copyright 2002,2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +int +main (int argc, char **argv) +{ + + int opt = 0; + std::string xmlOutputFile(""); + + while( (opt = getopt(argc, argv, "o:")) != -1){ + switch(opt){ + case 'o': + if(optarg){ + xmlOutputFile.assign(optarg); + } + else{ + std::cerr << "No xml file output specified for -o" << std::endl; + exit(EXIT_FAILURE); + } + break; + + default: /* '?' */ + fprintf(stderr, "Usage: %s [-o] \"xml output file\"\n", + argv[0]); + exit(EXIT_FAILURE); + } + + } + + CppUnit::TextUi::TestRunner runner; + + runner.addTest (qa_volk::suite ()); + + bool was_successful = false; + if(!xmlOutputFile.empty()){ + std::ofstream xmlOutput(xmlOutputFile.c_str()); + if(xmlOutput.is_open()){ + runner.setOutputter(new CppUnit::XmlOutputter(&runner.result(), xmlOutput)); + + was_successful = runner.run("", false, true, false); + } + xmlOutput.close(); + } + else{ + was_successful = runner.run ("", false); + } + + return was_successful ? 0 : 1; +} diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c new file mode 100644 index 000000000..b1a93db26 --- /dev/null +++ b/volk/lib/volk_rank_archs.c @@ -0,0 +1,13 @@ +#include +#include + +unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch) { + int i = 2; + unsigned int best_val = 0; + for(; i < arch_defs[0] + 1; ++i) { + if((arch_defs[i]&(~arch)) == 0) { + best_val = (arch_defs[i] > arch_defs[best_val + 1]) ? 
i-1 : best_val; + } + } + return best_val; +} diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h new file mode 100644 index 000000000..26b9f7503 --- /dev/null +++ b/volk/lib/volk_rank_archs.h @@ -0,0 +1,14 @@ +#ifndef INCLUDED_VOLK_RANK_ARCHS_H +#define INCLUDED_VOLK_RANK_ARCHS_H + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch); + + +#ifdef __cplusplus +} +#endif +#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/ -- cgit From f8b0c86d8a9eb347cb7187e3b01ed46c66de6a64 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 01:09:35 -0500 Subject: volk: Adding gitignore files. --- volk/lib/.gitignore | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 volk/lib/.gitignore (limited to 'volk/lib') diff --git a/volk/lib/.gitignore b/volk/lib/.gitignore new file mode 100644 index 000000000..573fb1618 --- /dev/null +++ b/volk/lib/.gitignore @@ -0,0 +1,21 @@ +/*.cache +/*.la +/*.lo +/*.pc +/.deps +/.la +/.libs +/.lo +/Makefile +/Makefile.in +/volk.c +/volk_cpu_generic.c +/volk_cpu_powerpc.c +/volk_cpu_x86.c +/volk_environment_init.c +/volk_init.c +/volk_init.h +/volk_mktables +/volk_mktables.c +/volk_proccpu_sim.c +/volk_runtime.c -- cgit From 74f206edb2c7bfbe010b5a5cbc5fe2f07965c3a6 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 01:29:58 -0500 Subject: volk: Fixing makefiles for dist. Distcheck still failing on other issues now. 
--- volk/lib/Makefile.am | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'volk/lib') diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 97eb75680..54df42d54 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -61,7 +61,7 @@ universal_CODE = \ volk_environment_init.c generic_CODE = \ - volk_cpu_generic.cc + volk_cpu_generic.c x86_CODE = \ volk_cpu_x86.c @@ -73,7 +73,7 @@ x86_64_SUBCODE = \ cpuid_x86_64.S powerpc_CODE = \ - volk_cpu_powerpc.cc + volk_cpu_powerpc.c if MD_CPU_generic @@ -236,6 +236,7 @@ libvolk_qa_la_LIBADD = \ noinst_HEADERS = \ volk_init.h \ qa_volk.h \ + assembly.h \ qa_16s_quad_max_star_aligned16.h \ qa_32fc_dot_prod_aligned16.h \ qa_32fc_square_dist_aligned16.h \ -- cgit From 46d55649012e4fb2838a6f8e9f3c9226ea8b2d50 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 12:19:28 -0500 Subject: volk: Working on VPATH build issues. Makes it through configure, fails on make. --- volk/lib/Makefile.am | 1 + 1 file changed, 1 insertion(+) (limited to 'volk/lib') diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 54df42d54..4ee934e8b 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -46,6 +46,7 @@ lib_LTLIBRARIES = \ libvolk_runtime.la \ libvolk_qa.la +EXTRA_DIST = volk_mktables.c # ---------------------------------------------------------------- # The main library -- cgit From 1cc88091470dd4654b6936cda92d81841e135209 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 17:00:38 -0500 Subject: volk: more changes to build system so that VPATH builds properly and project makes distcheck. --- volk/lib/Makefile.am | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'volk/lib') diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 4ee934e8b..7e808695f 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright 2008 Free Software Foundation, Inc. +# Copyright 2010 Free Software Foundation, Inc. 
# # This file is part of GNU Radio # @@ -20,7 +20,9 @@ include $(top_srcdir)/Makefile.common -AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) $(LV_CXXFLAGS) +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ + -I$(top_builddir)/include \ + $(LV_CXXFLAGS) $(WITH_INCLUDES) # We build 2 libraries and 1 executable here. One library contains @@ -46,7 +48,10 @@ lib_LTLIBRARIES = \ libvolk_runtime.la \ libvolk_qa.la -EXTRA_DIST = volk_mktables.c +EXTRA_DIST = \ + volk_mktables.c \ + volk_rank_archs.h \ + volk_proccpu_sim.c # ---------------------------------------------------------------- # The main library -- cgit From a8f33e1b577342fd8149d9308d474871c44c7d52 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 17:26:40 -0500 Subject: Removing autotests of volk during make check and distchecks since they take a long time to run. These can be run by hand by executing volk/lib/test_all Also made a comment about needing a possible fix for this makefile. --- volk/lib/Makefile.am | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'volk/lib') diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 7e808695f..a95860d11 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -20,6 +20,10 @@ include $(top_srcdir)/Makefile.common +#FIXME: forcing the top_builddir for distcheck seems like a bit +# of a hack. Figure out the right way to do this to find built +# volk_config.h and volk_tables.h + AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ -I$(top_builddir)/include \ $(LV_CXXFLAGS) $(WITH_INCLUDES) @@ -40,7 +44,7 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ # list of programs run by "make check" and "make distcheck" -TESTS = test_all +#TESTS = test_all lib_LTLIBRARIES = \ -- cgit From f3c684751dc3da3a06d5960d8b961739bdf0fd12 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 9 Dec 2010 17:34:29 -0500 Subject: volk: adding generic QA test for 16sc_magnitude_32f. 
--- volk/lib/qa_16sc_magnitude_32f_aligned16.cc | 42 ++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'volk/lib') diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc index 06dff2fd5..2c9e48f6e 100644 --- a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc +++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc @@ -8,7 +8,47 @@ #ifndef LV_HAVE_SSE3 void qa_16sc_magnitude_32f_aligned16::t1() { - printf("sse3 not available... no test performed\n"); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_known[vlen] __attribute__ ((aligned (16))); + + int16_t* inputLoad = (int16_t*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (int16_t)(rand() - (RAND_MAX/2)); + } + printf("16sc_magnitude_32f_aligned\n"); + + float scale = 32768.0; + for(int i = 0; i < vlen; ++i) { + float re = (float)(input0[i].real())/scale; + float im = (float)(input0[i].imag())/scale; + output_known[i] = sqrt(re*re + im*im); + } + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_32f_aligned16_manual(output_generic, input0, scale, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + /* + for(int i = 0; i < 100; ++i) { + printf("inputs: %d + j%d\n", input0[i].real(), input0[i].imag()); + printf("generic... 
%f == %f\n", output_generic[i], output_known[i]); + } + */ + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_known[i], fabs(output_generic[i])*1e-4); + } } #else -- cgit From 31c85c66f38ed304db06e0696b3df1d2407378c8 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 9 Dec 2010 17:53:05 -0500 Subject: volk: Adding a few more generic-only test cases. --- volk/lib/qa_32f_add_aligned16.cc | 55 ++++++++++++++++++++++++++++++++++- volk/lib/qa_32f_divide_aligned16.cc | 55 ++++++++++++++++++++++++++++++++++- volk/lib/qa_32f_multiply_aligned16.cc | 55 ++++++++++++++++++++++++++++++++++- volk/lib/qa_32f_sqrt_aligned16.cc | 53 +++++++++++++++++++++++++++++++++ 4 files changed, 215 insertions(+), 3 deletions(-) (limited to 'volk/lib') diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc index 92f35c7ec..002aebfc9 100644 --- a/volk/lib/qa_32f_add_aligned16.cc +++ b/volk/lib/qa_32f_add_aligned16.cc @@ -1,3 +1,22 @@ +/* -*- c++ -*- */ +/* + * Copyright 2010 Free Software Foundation, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, see + * . + */ + #include #include #include @@ -8,7 +27,41 @@ #ifndef LV_HAVE_SSE void qa_32f_add_aligned16::t1() { - printf("sse not available... 
no test performed\n"); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output_known[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + output_known[i] = input0[i] + input1[i]; + } + printf("32f_add_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_add_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + /* + for(int i = 0; i < 10; ++i) { + printf("inputs: %f, %f\n", input0[i], input1[i]); + printf("generic... %f == %f\n", output0[i], output_known[i]); + } + */ + + for(int i = 0; i < vlen; ++i) { + CPPUNIT_ASSERT_EQUAL(output0[i], output_known[i]); + } } #else diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc index b20999beb..8826bf94f 100644 --- a/volk/lib/qa_32f_divide_aligned16.cc +++ b/volk/lib/qa_32f_divide_aligned16.cc @@ -1,3 +1,22 @@ +/* -*- c++ -*- */ +/* + * Copyright 2010 Free Software Foundation, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, see + * . + */ + #include #include #include @@ -8,7 +27,41 @@ #ifndef LV_HAVE_SSE void qa_32f_divide_aligned16::t1() { - printf("sse not available... no test performed\n"); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output_known[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + output_known[i] = input0[i] / input1[i]; + } + printf("32f_divide_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_divide_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + /* + for(int i = 0; i < 10; ++i) { + printf("inputs: %f, %f\n", input0[i], input1[i]); + printf("generic... %f == %f\n", output0[i], output_known[i]); + } + */ + + for(int i = 0; i < vlen; ++i) { + CPPUNIT_ASSERT_EQUAL(output0[i], output_known[i]); + } } #else diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc index c77fe97da..e52748466 100644 --- a/volk/lib/qa_32f_multiply_aligned16.cc +++ b/volk/lib/qa_32f_multiply_aligned16.cc @@ -1,3 +1,22 @@ +/* -*- c++ -*- */ +/* + * Copyright 2010 Free Software Foundation, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, see + * . + */ + #include #include #include @@ -8,7 +27,41 @@ #ifndef LV_HAVE_SSE void qa_32f_multiply_aligned16::t1() { - printf("sse not available... no test performed\n"); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output_known[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + output_known[i] = input0[i] * input1[i]; + } + printf("32f_multiply_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_multiply_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + /* + for(int i = 0; i < 10; ++i) { + printf("inputs: %f, %f\n", input0[i], input1[i]); + printf("generic... %f == %f\n", output0[i], output_known[i]); + } + */ + + for(int i = 0; i < vlen; ++i) { + CPPUNIT_ASSERT_EQUAL(output0[i], output_known[i]); + } } #else diff --git a/volk/lib/qa_32f_sqrt_aligned16.cc b/volk/lib/qa_32f_sqrt_aligned16.cc index a3e6abc18..9a5f71de0 100644 --- a/volk/lib/qa_32f_sqrt_aligned16.cc +++ b/volk/lib/qa_32f_sqrt_aligned16.cc @@ -1,3 +1,22 @@ +/* -*- c++ -*- */ +/* + * Copyright 2010 Free Software Foundation, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, see + * . + */ + #include #include #include @@ -9,6 +28,40 @@ void qa_32f_sqrt_aligned16::t1() { printf("sse not available... no test performed\n"); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + float input0[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output_known[vlen] __attribute__ ((aligned (16))); + + // No reason to test negative numbers because they result in NaN. + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand()) / static_cast(RAND_MAX)); + output_known[i] = sqrt(input0[i]); + } + printf("32f_sqrt_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + /* + for(int i = 0; i < 10; ++i) { + printf("inputs: %f\n", input0[i]); + printf("generic... 
%f == %f\n", output0[i], output_known[i]); + } + */ + + for(int i = 0; i < vlen; ++i) { + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output_known[i], fabs(output0[i])*1e-4); + } } #else -- cgit From 8375fd6ca2f6e5edb923abe0d6341b6d4d2d1aae Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 10 Dec 2010 01:48:17 -0500 Subject: volk: Fixing build system to handle making volk_mktables, volk_tables.h, and volk_config.h instead of a standalone shell script. --- volk/lib/Makefile.am | 1 - 1 file changed, 1 deletion(-) (limited to 'volk/lib') diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index a95860d11..814d438fd 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -353,7 +353,6 @@ distclean-local: rm -f volk_cpu_x86.c rm -f volk_init.c rm -f volk_init.h - rm -f volk_mktables rm -f volk_mktables.c rm -f volk_proccpu_sim.c rm -f volk_runtime.c -- cgit From ce3e4c33d170b65cf288faec7d8da6a496eb6101 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 16 Dec 2010 21:33:54 -0500 Subject: Including time header to qa files. 
--- volk/lib/qa_16s_add_quad_aligned16.cc | 2 +- volk/lib/qa_16s_branch_4_state_8_aligned16.cc | 2 +- volk/lib/qa_16s_convert_32f_aligned16.cc | 1 + volk/lib/qa_16s_convert_32f_unaligned16.cc | 1 + volk/lib/qa_16s_convert_8s_aligned16.cc | 1 + volk/lib/qa_16s_convert_8s_unaligned16.cc | 1 + volk/lib/qa_16s_max_star_aligned16.cc | 2 +- volk/lib/qa_16s_max_star_horizontal_aligned16.cc | 2 +- volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc | 2 +- volk/lib/qa_16s_quad_max_star_aligned16.cc | 1 + volk/lib/qa_16sc_deinterleave_16s_aligned16.cc | 1 + volk/lib/qa_16sc_deinterleave_32f_aligned16.cc | 1 + volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc | 1 + volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc | 1 + volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc | 1 + volk/lib/qa_16sc_magnitude_16s_aligned16.cc | 1 + volk/lib/qa_16sc_magnitude_32f_aligned16.cc | 1 + volk/lib/qa_16u_byteswap_aligned16.cc | 1 + volk/lib/qa_32f_accumulator_aligned16.cc | 1 + volk/lib/qa_32f_add_aligned16.cc | 1 + volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc | 1 + volk/lib/qa_32f_convert_16s_aligned16.cc | 1 + volk/lib/qa_32f_convert_16s_unaligned16.cc | 1 + volk/lib/qa_32f_convert_32s_aligned16.cc | 1 + volk/lib/qa_32f_convert_32s_unaligned16.cc | 1 + volk/lib/qa_32f_convert_64f_aligned16.cc | 1 + volk/lib/qa_32f_convert_64f_unaligned16.cc | 1 + volk/lib/qa_32f_convert_8s_aligned16.cc | 1 + volk/lib/qa_32f_convert_8s_unaligned16.cc | 1 + volk/lib/qa_32f_divide_aligned16.cc | 1 + volk/lib/qa_32f_fm_detect_aligned16.cc | 1 + volk/lib/qa_32f_interleave_16sc_aligned16.cc | 1 + volk/lib/qa_32f_interleave_32fc_aligned16.cc | 1 + volk/lib/qa_32f_max_aligned16.cc | 1 + volk/lib/qa_32f_min_aligned16.cc | 1 + volk/lib/qa_32f_multiply_aligned16.cc | 1 + volk/lib/qa_32f_normalize_aligned16.cc | 1 + volk/lib/qa_32f_sqrt_aligned16.cc | 1 + volk/lib/qa_32f_stddev_aligned16.cc | 1 + volk/lib/qa_32f_stddev_and_mean_aligned16.cc | 1 + volk/lib/qa_32f_subtract_aligned16.cc | 1 + 
volk/lib/qa_32fc_atan2_32f_aligned16.cc | 1 + volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc | 1 + volk/lib/qa_32fc_deinterleave_32f_aligned16.cc | 1 + volk/lib/qa_32fc_deinterleave_64f_aligned16.cc | 1 + volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc | 1 + volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc | 1 + volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc | 1 + volk/lib/qa_32fc_magnitude_16s_aligned16.cc | 1 + volk/lib/qa_32fc_magnitude_32f_aligned16.cc | 1 + volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc | 1 + volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc | 1 + volk/lib/qa_32s_and_aligned16.cc | 1 + volk/lib/qa_32s_convert_32f_aligned16.cc | 1 + volk/lib/qa_32s_convert_32f_unaligned16.cc | 1 + volk/lib/qa_32s_or_aligned16.cc | 1 + volk/lib/qa_32u_byteswap_aligned16.cc | 1 + volk/lib/qa_32u_popcnt_aligned16.cc | 1 + volk/lib/qa_64f_convert_32f_aligned16.cc | 1 + volk/lib/qa_64f_convert_32f_unaligned16.cc | 1 + volk/lib/qa_64f_max_aligned16.cc | 1 + volk/lib/qa_64f_min_aligned16.cc | 1 + volk/lib/qa_64u_byteswap_aligned16.cc | 1 + volk/lib/qa_64u_popcnt_aligned16.cc | 1 + volk/lib/qa_8s_convert_16s_aligned16.cc | 1 + volk/lib/qa_8s_convert_16s_unaligned16.cc | 1 + volk/lib/qa_8s_convert_32f_aligned16.cc | 1 + volk/lib/qa_8s_convert_32f_unaligned16.cc | 1 + volk/lib/qa_8sc_deinterleave_16s_aligned16.cc | 1 + volk/lib/qa_8sc_deinterleave_32f_aligned16.cc | 1 + volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc | 1 + volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc | 1 + volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc | 1 + volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc | 2 +- volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc | 2 +- 75 files changed, 75 insertions(+), 7 deletions(-) (limited to 'volk/lib') diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc index c3005c1be..154aa0f17 100644 --- a/volk/lib/qa_16s_add_quad_aligned16.cc +++ b/volk/lib/qa_16s_add_quad_aligned16.cc @@ -2,7 
+2,7 @@ #include #include #include -#include +#include //test for sse2 #ifndef LV_HAVE_SSE2 diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc index ba5e8ed93..62deffaeb 100644 --- a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc @@ -1,7 +1,7 @@ #include #include #include -#include +#include //test for ssse3 diff --git a/volk/lib/qa_16s_convert_32f_aligned16.cc b/volk/lib/qa_16s_convert_32f_aligned16.cc index 7878d4737..6215f4a64 100644 --- a/volk/lib/qa_16s_convert_32f_aligned16.cc +++ b/volk/lib/qa_16s_convert_32f_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.cc b/volk/lib/qa_16s_convert_32f_unaligned16.cc index 8c3121e5c..46c2e48ac 100644 --- a/volk/lib/qa_16s_convert_32f_unaligned16.cc +++ b/volk/lib/qa_16s_convert_32f_unaligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_16s_convert_8s_aligned16.cc b/volk/lib/qa_16s_convert_8s_aligned16.cc index 734b7784e..8225aa0cf 100644 --- a/volk/lib/qa_16s_convert_8s_aligned16.cc +++ b/volk/lib/qa_16s_convert_8s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.cc b/volk/lib/qa_16s_convert_8s_unaligned16.cc index 275ab7668..e6ce5030e 100644 --- a/volk/lib/qa_16s_convert_8s_unaligned16.cc +++ b/volk/lib/qa_16s_convert_8s_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_16s_max_star_aligned16.cc b/volk/lib/qa_16s_max_star_aligned16.cc index b46b9ae8e..c6f828ba6 100644 --- a/volk/lib/qa_16s_max_star_aligned16.cc +++ b/volk/lib/qa_16s_max_star_aligned16.cc @@ -2,7 +2,7 @@ #include #include #include -#include +#include //test for ssse3 #ifndef LV_HAVE_SSSE3 diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.cc 
b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc index 4d44735df..0a58570e2 100644 --- a/volk/lib/qa_16s_max_star_horizontal_aligned16.cc +++ b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc @@ -3,7 +3,7 @@ #include #include #include -#include +#include //test for ssse3 #ifndef LV_HAVE_SSSE3 diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc index 3c4f5c6cc..819b2256b 100644 --- a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc @@ -2,7 +2,7 @@ #include #include #include -#include +#include //test for sse2 diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc index 80a220c93..66f8c9afa 100644 --- a/volk/lib/qa_16s_quad_max_star_aligned16.cc +++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc index e700ac72c..c775e8596 100644 --- a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc index 6ee076998..b25094e89 100644 --- a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc index ca048ea67..c67064ea6 100644 --- a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc 
b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc index 0f4ba6923..f86f03b88 100644 --- a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc index 5ab458bc9..dd446567e 100644 --- a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc index b14610757..9799ef43b 100644 --- a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc +++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc index 2c9e48f6e..1ebe644c5 100644 --- a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc +++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc index 6b19828a4..ea117a820 100644 --- a/volk/lib/qa_16u_byteswap_aligned16.cc +++ b/volk/lib/qa_16u_byteswap_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_accumulator_aligned16.cc b/volk/lib/qa_32f_accumulator_aligned16.cc index ea637d600..0defef283 100644 --- a/volk/lib/qa_32f_accumulator_aligned16.cc +++ b/volk/lib/qa_32f_accumulator_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc index 002aebfc9..f80d562d4 100644 --- a/volk/lib/qa_32f_add_aligned16.cc +++ b/volk/lib/qa_32f_add_aligned16.cc @@ -21,6 +21,7 @@ 
#include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc index 3c8137004..5d6987333 100644 --- a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc +++ b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_convert_16s_aligned16.cc b/volk/lib/qa_32f_convert_16s_aligned16.cc index 84a4c40c4..3e2452e68 100644 --- a/volk/lib/qa_32f_convert_16s_aligned16.cc +++ b/volk/lib/qa_32f_convert_16s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.cc b/volk/lib/qa_32f_convert_16s_unaligned16.cc index 9469daed2..e016b7ff7 100644 --- a/volk/lib/qa_32f_convert_16s_unaligned16.cc +++ b/volk/lib/qa_32f_convert_16s_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_32s_aligned16.cc b/volk/lib/qa_32f_convert_32s_aligned16.cc index ff24c7b0d..abceb52fb 100644 --- a/volk/lib/qa_32f_convert_32s_aligned16.cc +++ b/volk/lib/qa_32f_convert_32s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.cc b/volk/lib/qa_32f_convert_32s_unaligned16.cc index e63b17994..90f84b56f 100644 --- a/volk/lib/qa_32f_convert_32s_unaligned16.cc +++ b/volk/lib/qa_32f_convert_32s_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_64f_aligned16.cc b/volk/lib/qa_32f_convert_64f_aligned16.cc index c546e47de..1d0754ac9 100644 --- a/volk/lib/qa_32f_convert_64f_aligned16.cc +++ b/volk/lib/qa_32f_convert_64f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.cc b/volk/lib/qa_32f_convert_64f_unaligned16.cc index 
24b51f9af..6f7d5066d 100644 --- a/volk/lib/qa_32f_convert_64f_unaligned16.cc +++ b/volk/lib/qa_32f_convert_64f_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_8s_aligned16.cc b/volk/lib/qa_32f_convert_8s_aligned16.cc index a3d4d6567..6a53629b5 100644 --- a/volk/lib/qa_32f_convert_8s_aligned16.cc +++ b/volk/lib/qa_32f_convert_8s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.cc b/volk/lib/qa_32f_convert_8s_unaligned16.cc index d885fd6bb..fbc5c20e6 100644 --- a/volk/lib/qa_32f_convert_8s_unaligned16.cc +++ b/volk/lib/qa_32f_convert_8s_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc index 8826bf94f..3257a3751 100644 --- a/volk/lib/qa_32f_divide_aligned16.cc +++ b/volk/lib/qa_32f_divide_aligned16.cc @@ -21,6 +21,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc index ca65add28..592304f83 100644 --- a/volk/lib/qa_32f_fm_detect_aligned16.cc +++ b/volk/lib/qa_32f_fm_detect_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.cc b/volk/lib/qa_32f_interleave_16sc_aligned16.cc index 2a937637f..a7ae60780 100644 --- a/volk/lib/qa_32f_interleave_16sc_aligned16.cc +++ b/volk/lib/qa_32f_interleave_16sc_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.cc b/volk/lib/qa_32f_interleave_32fc_aligned16.cc index c22dd1046..333b6fce8 100644 --- a/volk/lib/qa_32f_interleave_32fc_aligned16.cc +++ b/volk/lib/qa_32f_interleave_32fc_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git 
a/volk/lib/qa_32f_max_aligned16.cc b/volk/lib/qa_32f_max_aligned16.cc index 3ef375176..ceb913cb4 100644 --- a/volk/lib/qa_32f_max_aligned16.cc +++ b/volk/lib/qa_32f_max_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_min_aligned16.cc b/volk/lib/qa_32f_min_aligned16.cc index 617e18b24..580a60e7d 100644 --- a/volk/lib/qa_32f_min_aligned16.cc +++ b/volk/lib/qa_32f_min_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc index e52748466..0c242b649 100644 --- a/volk/lib/qa_32f_multiply_aligned16.cc +++ b/volk/lib/qa_32f_multiply_aligned16.cc @@ -21,6 +21,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_normalize_aligned16.cc b/volk/lib/qa_32f_normalize_aligned16.cc index 2954fc3ae..1c7b485a6 100644 --- a/volk/lib/qa_32f_normalize_aligned16.cc +++ b/volk/lib/qa_32f_normalize_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_sqrt_aligned16.cc b/volk/lib/qa_32f_sqrt_aligned16.cc index 9a5f71de0..62d55767a 100644 --- a/volk/lib/qa_32f_sqrt_aligned16.cc +++ b/volk/lib/qa_32f_sqrt_aligned16.cc @@ -21,6 +21,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_stddev_aligned16.cc b/volk/lib/qa_32f_stddev_aligned16.cc index c0f22cdea..5934d70df 100644 --- a/volk/lib/qa_32f_stddev_aligned16.cc +++ b/volk/lib/qa_32f_stddev_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.cc b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc index dcad8bcf3..78c701d78 100644 --- a/volk/lib/qa_32f_stddev_and_mean_aligned16.cc +++ b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_subtract_aligned16.cc 
b/volk/lib/qa_32f_subtract_aligned16.cc index a7e1b5ae3..ffe4b504c 100644 --- a/volk/lib/qa_32f_subtract_aligned16.cc +++ b/volk/lib/qa_32f_subtract_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.cc b/volk/lib/qa_32fc_atan2_32f_aligned16.cc index a24382d71..c55ab5aa0 100644 --- a/volk/lib/qa_32fc_atan2_32f_aligned16.cc +++ b/volk/lib/qa_32fc_atan2_32f_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc index 497914e0a..2f9a30395 100644 --- a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc +++ b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include #define assertcomplexEqual(expected, actual, delta) \ diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc index 0f5a030f5..72e084c05 100644 --- a/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc +++ b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc index 6e051afbc..89770c236 100644 --- a/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc +++ b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc index 850518524..7472476f7 100644 --- a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc +++ b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc index 
321deb184..5cbdc49b3 100644 --- a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc +++ b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc index aedb2e387..4147e30ae 100644 --- a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc +++ b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc index a4be1616b..16984e30d 100644 --- a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc +++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc index d69ada408..b99f1ddcf 100644 --- a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc +++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc index 83cdf4b15..a3d0955bd 100644 --- a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse3 diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc index 4d1359068..1444c78a9 100644 --- a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc +++ b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse3 diff --git a/volk/lib/qa_32s_and_aligned16.cc b/volk/lib/qa_32s_and_aligned16.cc index 72d05cf6f..661801709 100644 --- a/volk/lib/qa_32s_and_aligned16.cc 
+++ b/volk/lib/qa_32s_and_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32s_convert_32f_aligned16.cc b/volk/lib/qa_32s_convert_32f_aligned16.cc index eab3fe016..07d799809 100644 --- a/volk/lib/qa_32s_convert_32f_aligned16.cc +++ b/volk/lib/qa_32s_convert_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.cc b/volk/lib/qa_32s_convert_32f_unaligned16.cc index 0e504cfa1..2ec610ffb 100644 --- a/volk/lib/qa_32s_convert_32f_unaligned16.cc +++ b/volk/lib/qa_32s_convert_32f_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32s_or_aligned16.cc b/volk/lib/qa_32s_or_aligned16.cc index e09dfb91c..9da2ae344 100644 --- a/volk/lib/qa_32s_or_aligned16.cc +++ b/volk/lib/qa_32s_or_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32u_byteswap_aligned16.cc b/volk/lib/qa_32u_byteswap_aligned16.cc index 8b1023876..313c786b6 100644 --- a/volk/lib/qa_32u_byteswap_aligned16.cc +++ b/volk/lib/qa_32u_byteswap_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc index 49fcddeb2..618a82a02 100644 --- a/volk/lib/qa_32u_popcnt_aligned16.cc +++ b/volk/lib/qa_32u_popcnt_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_64f_convert_32f_aligned16.cc b/volk/lib/qa_64f_convert_32f_aligned16.cc index 0eaebf00a..7f9c4584a 100644 --- a/volk/lib/qa_64f_convert_32f_aligned16.cc +++ b/volk/lib/qa_64f_convert_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.cc b/volk/lib/qa_64f_convert_32f_unaligned16.cc index dcf94bd27..98aadbf4d 100644 --- a/volk/lib/qa_64f_convert_32f_unaligned16.cc +++ 
b/volk/lib/qa_64f_convert_32f_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_64f_max_aligned16.cc b/volk/lib/qa_64f_max_aligned16.cc index 41ab078b0..76e755514 100644 --- a/volk/lib/qa_64f_max_aligned16.cc +++ b/volk/lib/qa_64f_max_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_64f_min_aligned16.cc b/volk/lib/qa_64f_min_aligned16.cc index b4664d065..4b70d2881 100644 --- a/volk/lib/qa_64f_min_aligned16.cc +++ b/volk/lib/qa_64f_min_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_64u_byteswap_aligned16.cc b/volk/lib/qa_64u_byteswap_aligned16.cc index 4f5d4d02b..20d012c9e 100644 --- a/volk/lib/qa_64u_byteswap_aligned16.cc +++ b/volk/lib/qa_64u_byteswap_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc index bce9ff6c2..85ef58795 100644 --- a/volk/lib/qa_64u_popcnt_aligned16.cc +++ b/volk/lib/qa_64u_popcnt_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_8s_convert_16s_aligned16.cc b/volk/lib/qa_8s_convert_16s_aligned16.cc index 35f08fb81..8dd5f76ca 100644 --- a/volk/lib/qa_8s_convert_16s_aligned16.cc +++ b/volk/lib/qa_8s_convert_16s_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse4_1 diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.cc b/volk/lib/qa_8s_convert_16s_unaligned16.cc index bb326f818..12c502d4b 100644 --- a/volk/lib/qa_8s_convert_16s_unaligned16.cc +++ b/volk/lib/qa_8s_convert_16s_unaligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse4_1 diff --git a/volk/lib/qa_8s_convert_32f_aligned16.cc b/volk/lib/qa_8s_convert_32f_aligned16.cc index 522da0b9d..672f5662f 100644 --- a/volk/lib/qa_8s_convert_32f_aligned16.cc +++ 
b/volk/lib/qa_8s_convert_32f_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse4.1 diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.cc b/volk/lib/qa_8s_convert_32f_unaligned16.cc index ea1fb7c74..43468b1b1 100644 --- a/volk/lib/qa_8s_convert_32f_unaligned16.cc +++ b/volk/lib/qa_8s_convert_32f_unaligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse4.1 diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc index 823e7fe2e..94e63e37d 100644 --- a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc +++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc index fb580516c..29073eed7 100644 --- a/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc +++ b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc index 1cc844b52..4980c982a 100644 --- a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc +++ b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc index 10e537cde..3c3f737a1 100644 --- a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc +++ b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc index d84df8119..a33d1bf30 100644 --- a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc +++ b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc @@ -2,6 +2,7 @@ #include #include 
#include +#include //test for sse diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc index d64eac8ce..216bf1cef 100644 --- a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc +++ b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc @@ -3,7 +3,7 @@ #include #include #include -#include +#include #define assertcomplexEqual(expected, actual, delta) \ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc index c27f0e0ca..4c707446e 100644 --- a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc +++ b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc @@ -3,7 +3,7 @@ #include #include #include -#include +#include #define assertcomplexEqual(expected, actual, delta) \ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ -- cgit