summaryrefslogtreecommitdiff
path: root/volk/include
diff options
context:
space:
mode:
Diffstat (limited to 'volk/include')
-rw-r--r--volk/include/.gitignore10
-rw-r--r--volk/include/Makefile.am23
-rw-r--r--volk/include/volk/.gitignore2
-rw-r--r--volk/include/volk/Makefile.am140
-rw-r--r--volk/include/volk/volk_16i_branch_4_state_8_a.h118
-rw-r--r--volk/include/volk/volk_16i_convert_8i_a.h4
-rw-r--r--volk/include/volk/volk_16i_convert_8i_u.h4
-rw-r--r--volk/include/volk/volk_16i_max_star_16i_a.h68
-rw-r--r--volk/include/volk/volk_16i_max_star_horizontal_16i_a.h76
-rw-r--r--volk/include/volk/volk_16i_permute_and_scalar_add_a.h62
-rw-r--r--volk/include/volk/volk_16i_s32f_convert_32f_a.h8
-rw-r--r--volk/include/volk/volk_16i_s32f_convert_32f_u.h8
-rw-r--r--volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h56
-rw-r--r--volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h22
-rw-r--r--volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h2
-rw-r--r--volk/include/volk/volk_16ic_deinterleave_real_16i_a.h2
-rw-r--r--volk/include/volk/volk_16ic_magnitude_16i_a.h6
-rw-r--r--volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h6
-rw-r--r--volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h10
-rw-r--r--volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h14
-rw-r--r--volk/include/volk/volk_16u_byteswap_a.h4
-rw-r--r--volk/include/volk/volk_32f_accumulator_s32f_a.h6
-rw-r--r--volk/include/volk/volk_32f_convert_64f_a.h6
-rw-r--r--volk/include/volk/volk_32f_convert_64f_u.h6
-rw-r--r--volk/include/volk/volk_32f_index_max_16u_a.h12
-rw-r--r--volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h12
-rw-r--r--volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h8
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_16i_a.h8
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_16i_u.h8
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_32i_a.h12
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_32i_u.h8
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_8i_a.h10
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_8i_u.h10
-rw-r--r--volk/include/volk/volk_32f_s32f_multiply_32f_a.h20
-rw-r--r--volk/include/volk/volk_32f_s32f_multiply_32f_u.h20
-rw-r--r--volk/include/volk/volk_32f_s32f_power_32f_a.h20
-rw-r--r--volk/include/volk/volk_32f_s32f_stddev_32f_a.h12
-rw-r--r--volk/include/volk/volk_32f_sqrt_32f_a.h10
-rw-r--r--volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h12
-rw-r--r--volk/include/volk/volk_32f_x2_add_32f_a.h10
-rw-r--r--volk/include/volk/volk_32f_x2_add_32f_u.h10
-rw-r--r--volk/include/volk/volk_32f_x2_divide_32f_a.h10
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_32f_a.h32
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_32f_u.h32
-rw-r--r--volk/include/volk/volk_32f_x2_interleave_32fc_a.h2
-rw-r--r--volk/include/volk/volk_32f_x2_max_32f_a.h10
-rw-r--r--volk/include/volk/volk_32f_x2_min_32f_a.h10
-rw-r--r--volk/include/volk/volk_32f_x2_multiply_32f_a.h20
-rw-r--r--volk/include/volk/volk_32f_x2_multiply_32f_u.h20
-rw-r--r--volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h12
-rw-r--r--volk/include/volk/volk_32f_x2_subtract_32f_a.h10
-rw-r--r--volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h70
-rw-r--r--volk/include/volk/volk_32fc_32f_multiply_32fc_a.h16
-rw-r--r--volk/include/volk/volk_32fc_conjugate_32fc_a.h4
-rw-r--r--volk/include/volk/volk_32fc_conjugate_32fc_u.h6
-rw-r--r--volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h4
-rw-r--r--volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h8
-rw-r--r--volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h2
-rw-r--r--volk/include/volk/volk_32fc_deinterleave_real_32f_a.h2
-rw-r--r--volk/include/volk/volk_32fc_deinterleave_real_64f_a.h6
-rw-r--r--volk/include/volk/volk_32fc_index_max_16u_a.h126
-rw-r--r--volk/include/volk/volk_32fc_magnitude_32f_a.h2
-rw-r--r--volk/include/volk/volk_32fc_magnitude_32f_u.h2
-rw-r--r--volk/include/volk/volk_32fc_magnitude_squared_32f_a.h2
-rw-r--r--volk/include/volk/volk_32fc_magnitude_squared_32f_u.h2
-rw-r--r--volk/include/volk/volk_32fc_s32f_atan2_32f_a.h20
-rw-r--r--volk/include/volk/volk_32fc_s32f_power_32fc_a.h38
-rw-r--r--volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h24
-rw-r--r--volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h24
-rw-r--r--volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h12
-rw-r--r--volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h12
-rw-r--r--volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h84
-rw-r--r--volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h32
-rw-r--r--volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h126
-rw-r--r--volk/include/volk/volk_32fc_x2_multiply_32fc_a.h14
-rw-r--r--volk/include/volk/volk_32fc_x2_multiply_32fc_u.h14
-rw-r--r--volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h14
-rw-r--r--volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h14
-rw-r--r--volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h56
-rw-r--r--volk/include/volk/volk_32fc_x2_square_dist_32f_a.h40
-rw-r--r--volk/include/volk/volk_32i_s32f_convert_32f_a.h2
-rw-r--r--volk/include/volk/volk_32i_s32f_convert_32f_u.h2
-rw-r--r--volk/include/volk/volk_32i_x2_and_32i_a.h10
-rw-r--r--volk/include/volk/volk_32i_x2_or_32i_a.h10
-rw-r--r--volk/include/volk/volk_32u_byteswap_a.h6
-rw-r--r--volk/include/volk/volk_64f_convert_32f_a.h6
-rw-r--r--volk/include/volk/volk_64f_convert_32f_u.h6
-rw-r--r--volk/include/volk/volk_64f_x2_max_64f_a.h10
-rw-r--r--volk/include/volk/volk_64f_x2_min_64f_a.h10
-rw-r--r--volk/include/volk/volk_64u_byteswap_a.h18
-rw-r--r--volk/include/volk/volk_64u_popcnt_a.h2
-rw-r--r--volk/include/volk/volk_8i_s32f_convert_32f_a.h2
-rw-r--r--volk/include/volk/volk_8i_s32f_convert_32f_u.h2
-rw-r--r--volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h8
-rw-r--r--volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h10
-rw-r--r--volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h8
-rw-r--r--volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h4
97 files changed, 870 insertions, 1045 deletions
diff --git a/volk/include/.gitignore b/volk/include/.gitignore
deleted file mode 100644
index 378f771f5..000000000
--- a/volk/include/.gitignore
+++ /dev/null
@@ -1,10 +0,0 @@
-/*.cache
-/*.la
-/*.lo
-/*.pc
-/.deps
-/.la
-/.libs
-/.lo
-/Makefile
-/Makefile.in
diff --git a/volk/include/Makefile.am b/volk/include/Makefile.am
deleted file mode 100644
index 375d1a7d5..000000000
--- a/volk/include/Makefile.am
+++ /dev/null
@@ -1,23 +0,0 @@
-#
-# Copyright 2008 Free Software Foundation, Inc.
-#
-# This file is part of GNU Radio
-#
-# GNU Radio is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3, or (at your option)
-# any later version.
-#
-# GNU Radio is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-#
-
-include $(top_srcdir)/Makefile.common
-
-SUBDIRS = volk
diff --git a/volk/include/volk/.gitignore b/volk/include/volk/.gitignore
deleted file mode 100644
index b336cc7ce..000000000
--- a/volk/include/volk/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-/Makefile
-/Makefile.in
diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
deleted file mode 100644
index a01ddf193..000000000
--- a/volk/include/volk/Makefile.am
+++ /dev/null
@@ -1,140 +0,0 @@
-#
-# Copyright 2010,2011 Free Software Foundation, Inc.
-#
-# This file is part of GNU Radio
-#
-# GNU Radio is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3, or (at your option)
-# any later version.
-#
-# GNU Radio is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-#
-
-include $(top_srcdir)/Makefile.common
-
-AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \
- $(LV_CXXFLAGS) $(WITH_INCLUDES)
-
-volkincludedir = $(prefix)/include/volk
-
-volkinclude_HEADERS = \
- volk_complex.h \
- volk_common.h \
- volk_prefs.h \
- $(top_gendir)/include/volk/volk_config_fixed.h \
- $(top_gendir)/include/volk/volk_typedefs.h \
- $(top_gendir)/include/volk/volk.h \
- $(top_gendir)/include/volk/volk_cpu.h \
- volk_16i_x5_add_quad_16i_x4_a.h \
- volk_16i_branch_4_state_8_a.h \
- volk_16ic_deinterleave_16i_x2_a.h \
- volk_16ic_s32f_deinterleave_32f_x2_a.h \
- volk_16ic_deinterleave_real_16i_a.h \
- volk_16ic_s32f_deinterleave_real_32f_a.h \
- volk_16ic_deinterleave_real_8i_a.h \
- volk_16ic_magnitude_16i_a.h \
- volk_16ic_s32f_magnitude_32f_a.h \
- volk_16i_s32f_convert_32f_a.h \
- volk_16i_s32f_convert_32f_u.h \
- volk_16i_convert_8i_a.h \
- volk_16i_convert_8i_u.h \
- volk_16i_max_star_16i_a.h \
- volk_16i_max_star_horizontal_16i_a.h \
- volk_16i_permute_and_scalar_add_a.h \
- volk_16i_x4_quad_max_star_16i_a.h \
- volk_16u_byteswap_a.h \
- volk_32f_accumulator_s32f_a.h \
- volk_32f_x2_add_32f_a.h \
- volk_32f_x2_add_32f_u.h \
- volk_32f_s32f_multiply_32f_a.h \
- volk_32f_s32f_multiply_32f_u.h \
- volk_32fc_32f_multiply_32fc_a.h \
- volk_32fc_s32fc_multiply_32fc_a.h \
- volk_32fc_s32fc_multiply_32fc_u.h \
- volk_32fc_x2_multiply_conjugate_32fc_a.h \
- volk_32fc_x2_multiply_conjugate_32fc_u.h \
- volk_32fc_s32f_power_32fc_a.h \
- volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \
- volk_32fc_s32f_atan2_32f_a.h \
- volk_32fc_x2_conjugate_dot_prod_32fc_a.h \
- volk_32fc_x2_conjugate_dot_prod_32fc_u.h \
- volk_32fc_deinterleave_32f_x2_a.h \
- volk_32fc_deinterleave_64f_x2_a.h \
- volk_32fc_s32f_deinterleave_real_16i_a.h \
- volk_32fc_deinterleave_real_32f_a.h \
- volk_32fc_deinterleave_imag_32f_a.h \
- volk_32fc_deinterleave_real_64f_a.h \
- volk_32fc_x2_dot_prod_32fc_a.h \
- volk_32fc_x2_dot_prod_32fc_u.h \
- volk_32fc_index_max_16u_a.h \
- volk_32fc_s32f_magnitude_16i_a.h \
- volk_32fc_magnitude_32f_a.h \
- volk_32fc_magnitude_32f_u.h \
- volk_32fc_magnitude_squared_32f_a.h \
- volk_32fc_magnitude_squared_32f_u.h \
- volk_32fc_x2_multiply_32fc_a.h \
- volk_32fc_x2_multiply_32fc_u.h \
- volk_32f_s32f_convert_16i_a.h \
- volk_32f_s32f_convert_16i_u.h \
- volk_32f_s32f_convert_32i_a.h \
- volk_32f_s32f_convert_32i_u.h \
- volk_32f_convert_64f_a.h \
- volk_32f_convert_64f_u.h \
- volk_32f_s32f_convert_8i_a.h \
- volk_32f_s32f_convert_8i_u.h \
- volk_32fc_s32f_x2_power_spectral_density_32f_a.h \
- volk_32fc_s32f_power_spectrum_32f_a.h \
- volk_32fc_x2_square_dist_32f_a.h \
- volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h \
- volk_32f_x2_divide_32f_a.h \
- volk_32f_x2_dot_prod_32f_a.h \
- volk_32f_x2_dot_prod_32f_u.h \
- volk_32f_s32f_32f_fm_detect_32f_a.h \
- volk_32f_index_max_16u_a.h \
- volk_32f_x2_s32f_interleave_16ic_a.h \
- volk_32f_x2_interleave_32fc_a.h \
- volk_32f_x2_max_32f_a.h \
- volk_32f_x2_min_32f_a.h \
- volk_32f_x2_multiply_32f_a.h \
- volk_32f_x2_multiply_32f_u.h \
- volk_32f_s32f_normalize_a.h \
- volk_32f_s32f_power_32f_a.h \
- volk_32f_sqrt_32f_a.h \
- volk_32f_s32f_stddev_32f_a.h \
- volk_32f_stddev_and_mean_32f_x2_a.h \
- volk_32f_x2_subtract_32f_a.h \
- volk_32f_x3_sum_of_poly_32f_a.h \
- volk_32i_x2_and_32i_a.h \
- volk_32i_s32f_convert_32f_a.h \
- volk_32i_s32f_convert_32f_u.h \
- volk_32i_x2_or_32i_a.h \
- volk_32u_byteswap_a.h \
- volk_32u_popcnt_a.h \
- volk_64f_convert_32f_a.h \
- volk_64f_convert_32f_u.h \
- volk_64f_x2_max_64f_a.h \
- volk_64f_x2_min_64f_a.h \
- volk_64u_byteswap_a.h \
- volk_64u_popcnt_a.h \
- volk_8ic_deinterleave_16i_x2_a.h \
- volk_8ic_s32f_deinterleave_32f_x2_a.h \
- volk_8ic_deinterleave_real_16i_a.h \
- volk_8ic_s32f_deinterleave_real_32f_a.h \
- volk_8ic_deinterleave_real_8i_a.h \
- volk_8ic_x2_multiply_conjugate_16ic_a.h \
- volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h \
- volk_8i_convert_16i_a.h \
- volk_8i_convert_16i_u.h \
- volk_8i_s32f_convert_32f_a.h \
- volk_8i_s32f_convert_32f_u.h \
- volk_32fc_conjugate_32fc_a.h \
- volk_32fc_conjugate_32fc_u.h
-
diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a.h b/volk/include/volk/volk_16i_branch_4_state_8_a.h
index 0424e66e9..6338fbdd1 100644
--- a/volk/include/volk/volk_16i_branch_4_state_8_a.h
+++ b/volk/include/volk/volk_16i_branch_4_state_8_a.h
@@ -3,7 +3,7 @@
#include<inttypes.h>
-#include<stdio.h>
+#include<stdio.h>
@@ -15,32 +15,32 @@
#include<tmmintrin.h>
static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
-
-
+
+
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
__m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
-
-
+
+
p_target = (__m128i*)target;
p_src0 = (__m128i*)src0;
p_cntl2 = (__m128i*)cntl2;
p_cntl3 = (__m128i*)cntl3;
p_scalars = (__m128i*)scalars;
-
+
int i = 0;
-
+
int bound = 1;
-
-
+
+
xmm0 = _mm_load_si128(p_scalars);
-
+
xmm1 = _mm_shufflelo_epi16(xmm0, 0);
xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
-
+
xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
@@ -52,40 +52,40 @@ static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src
xmm10 = _mm_load_si128((__m128i*)permuters[3]);
for(; i < bound; ++i) {
-
+
xmm5 = _mm_load_si128(p_src0);
-
-
-
-
-
-
-
+
+
+
+
+
+
+
xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
-
+
p_src0 += 4;
-
-
+
+
xmm5 = _mm_add_epi16(xmm1, xmm2);
-
+
xmm6 = _mm_add_epi16(xmm2, xmm6);
xmm8 = _mm_add_epi16(xmm1, xmm8);
-
-
+
+
xmm7 = _mm_load_si128(p_cntl2);
xmm9 = _mm_load_si128(p_cntl3);
-
+
xmm0 = _mm_add_epi16(xmm5, xmm0);
-
-
+
+
xmm7 = _mm_and_si128(xmm7, xmm3);
xmm9 = _mm_and_si128(xmm9, xmm4);
-
+
xmm5 = _mm_load_si128(&p_cntl2[1]);
xmm11 = _mm_load_si128(&p_cntl3[1]);
@@ -95,96 +95,96 @@ static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src
xmm11 = _mm_and_si128(xmm11, xmm4);
xmm0 = _mm_add_epi16(xmm0, xmm7);
-
-
-
+
+
+
xmm7 = _mm_load_si128(&p_cntl2[2]);
xmm9 = _mm_load_si128(&p_cntl3[2]);
-
+
xmm5 = _mm_add_epi16(xmm5, xmm11);
-
+
xmm7 = _mm_and_si128(xmm7, xmm3);
xmm9 = _mm_and_si128(xmm9, xmm4);
-
+
xmm6 = _mm_add_epi16(xmm6, xmm5);
-
-
+
+
xmm5 = _mm_load_si128(&p_cntl2[3]);
xmm11 = _mm_load_si128(&p_cntl3[3]);
-
+
xmm7 = _mm_add_epi16(xmm7, xmm9);
-
+
xmm5 = _mm_and_si128(xmm5, xmm3);
xmm11 = _mm_and_si128(xmm11, xmm4);
-
+
xmm8 = _mm_add_epi16(xmm8, xmm7);
-
+
xmm5 = _mm_add_epi16(xmm5, xmm11);
-
+
_mm_store_si128(p_target, xmm0);
_mm_store_si128(&p_target[1], xmm6);
xmm10 = _mm_add_epi16(xmm5, xmm10);
-
+
_mm_store_si128(&p_target[2], xmm8);
-
+
_mm_store_si128(&p_target[3], xmm10);
-
- p_target += 3;
+
+ p_target += 3;
}
}
-
-
+
+
#endif /*LV_HAVE_SSEs*/
#ifdef LV_HAVE_GENERIC
static inline void volk_16i_branch_4_state_8_a_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
int i = 0;
-
+
int bound = 4;
-
+
for(; i < bound; ++i) {
- target[i* 8] = src0[((char)permuters[i][0])/2]
+ target[i* 8] = src0[((char)permuters[i][0])/2]
+ ((i + 1)%2 * scalars[0])
+ (((i >> 1)^1) * scalars[1])
+ (cntl2[i * 8] & scalars[2])
+ (cntl3[i * 8] & scalars[3]);
- target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2]
+ target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2]
+ ((i + 1)%2 * scalars[0])
+ (((i >> 1)^1) * scalars[1])
+ (cntl2[i * 8 + 1] & scalars[2])
+ (cntl3[i * 8 + 1] & scalars[3]);
- target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2]
+ target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2]
+ ((i + 1)%2 * scalars[0])
+ (((i >> 1)^1) * scalars[1])
+ (cntl2[i * 8 + 2] & scalars[2])
+ (cntl3[i * 8 + 2] & scalars[3]);
- target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2]
+ target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2]
+ ((i + 1)%2 * scalars[0])
+ (((i >> 1)^1) * scalars[1])
+ (cntl2[i * 8 + 3] & scalars[2])
+ (cntl3[i * 8 + 3] & scalars[3]);
- target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2]
+ target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2]
+ ((i + 1)%2 * scalars[0])
+ (((i >> 1)^1) * scalars[1])
+ (cntl2[i * 8 + 4] & scalars[2])
+ (cntl3[i * 8 + 4] & scalars[3]);
- target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2]
+ target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2]
+ ((i + 1)%2 * scalars[0])
+ (((i >> 1)^1) * scalars[1])
+ (cntl2[i * 8 + 5] & scalars[2])
+ (cntl3[i * 8 + 5] & scalars[3]);
- target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2]
+ target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2]
+ ((i + 1)%2 * scalars[0])
+ (((i >> 1)^1) * scalars[1])
+ (cntl2[i * 8 + 6] & scalars[2])
+ (cntl3[i * 8 + 6] & scalars[3]);
- target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2]
+ target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2]
+ ((i + 1)%2 * scalars[0])
+ (((i >> 1)^1) * scalars[1])
+ (cntl2[i * 8 + 7] & scalars[2])
+ (cntl3[i * 8 + 7] & scalars[3]);
-
+
}
}
diff --git a/volk/include/volk/volk_16i_convert_8i_a.h b/volk/include/volk/volk_16i_convert_8i_a.h
index 8046035c7..84548c8c5 100644
--- a/volk/include/volk/volk_16i_convert_8i_a.h
+++ b/volk/include/volk/volk_16i_convert_8i_a.h
@@ -15,7 +15,7 @@
static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
unsigned int number = 0;
const unsigned int sixteenthPoints = num_points / 16;
-
+
int8_t* outputVectorPtr = outputVector;
int16_t* inputPtr = (int16_t*)inputVector;
__m128i inputVal1;
@@ -30,7 +30,7 @@ static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_
inputVal1 = _mm_srai_epi16(inputVal1, 8);
inputVal2 = _mm_srai_epi16(inputVal2, 8);
-
+
ret = _mm_packs_epi16(inputVal1, inputVal2);
_mm_store_si128((__m128i*)outputVectorPtr, ret);
diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/include/volk/volk_16i_convert_8i_u.h
index df1084fe0..80608a141 100644
--- a/volk/include/volk/volk_16i_convert_8i_u.h
+++ b/volk/include/volk/volk_16i_convert_8i_u.h
@@ -16,7 +16,7 @@
static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
unsigned int number = 0;
const unsigned int sixteenthPoints = num_points / 16;
-
+
int8_t* outputVectorPtr = outputVector;
int16_t* inputPtr = (int16_t*)inputVector;
__m128i inputVal1;
@@ -31,7 +31,7 @@ static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_
inputVal1 = _mm_srai_epi16(inputVal1, 8);
inputVal2 = _mm_srai_epi16(inputVal2, 8);
-
+
ret = _mm_packs_epi16(inputVal1, inputVal2);
_mm_storeu_si128((__m128i*)outputVectorPtr, ret);
diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/include/volk/volk_16i_max_star_16i_a.h
index 28197ddef..edfff8a82 100644
--- a/volk/include/volk/volk_16i_max_star_16i_a.h
+++ b/volk/include/volk/volk_16i_max_star_16i_a.h
@@ -3,7 +3,7 @@
#include<inttypes.h>
-#include<stdio.h>
+#include<stdio.h>
#ifdef LV_HAVE_SSSE3
@@ -15,82 +15,82 @@
static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_bytes) {
-
+
short candidate = src0[0];
short cands[8];
__m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
-
+
__m128i *p_src0;
-
+
p_src0 = (__m128i*)src0;
int bound = num_bytes >> 4;
int leftovers = (num_bytes >> 1) & 7;
-
+
int i = 0;
-
-
+
+
xmm1 = _mm_setzero_si128();
xmm0 = _mm_setzero_si128();
//_mm_insert_epi16(xmm0, candidate, 0);
-
- xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
-
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
+
+
for(i = 0; i < bound; ++i) {
xmm1 = _mm_load_si128(p_src0);
p_src0 += 1;
//xmm2 = _mm_sub_epi16(xmm1, xmm0);
-
-
-
-
-
+
+
+
+
+
xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
xmm6 = _mm_xor_si128(xmm4, xmm5);
-
+
xmm3 = _mm_and_si128(xmm3, xmm0);
xmm4 = _mm_and_si128(xmm6, xmm1);
-
+
xmm0 = _mm_add_epi16(xmm3, xmm4);
-
-
+
+
}
-
+
_mm_store_si128((__m128i*)cands, xmm0);
-
+
for(i = 0; i < 8; ++i) {
candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
}
-
-
-
+
+
+
for(i = 0; i < leftovers; ++i) {
-
+
candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
}
target[0] = candidate;
-
-
-
-
-}
-
+
+
+
+
+}
+
#endif /*LV_HAVE_SSSE3*/
#ifdef LV_HAVE_GENERIC
static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, unsigned int num_bytes) {
-
+
int i = 0;
-
+
int bound = num_bytes >> 1;
short candidate = src0[0];
@@ -98,7 +98,7 @@ static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, u
candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
}
target[0] = candidate;
-
+
}
diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h
index a10a62350..c1c908425 100644
--- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h
+++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h
@@ -4,7 +4,7 @@
#include <volk/volk_common.h>
#include<inttypes.h>
-#include<stdio.h>
+#include<stdio.h>
#ifdef LV_HAVE_SSSE3
@@ -20,107 +20,107 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in
const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
-
-
+
+
__m128i xmm0, xmm1, xmm2, xmm3, xmm4;
__m128i xmm5, xmm6, xmm7, xmm8;
-
+
xmm4 = _mm_load_si128((__m128i*)shufmask0);
xmm5 = _mm_load_si128((__m128i*)shufmask1);
xmm6 = _mm_load_si128((__m128i*)andmask0);
xmm7 = _mm_load_si128((__m128i*)andmask1);
-
+
__m128i *p_target, *p_src0;
-
+
p_target = (__m128i*)target;
p_src0 = (__m128i*)src0;
int bound = num_bytes >> 5;
int intermediate = (num_bytes >> 4) & 1;
int leftovers = (num_bytes >> 1) & 7;
-
+
int i = 0;
-
-
+
+
for(i = 0; i < bound; ++i) {
-
+
xmm0 = _mm_load_si128(p_src0);
xmm1 = _mm_load_si128(&p_src0[1]);
-
-
+
+
xmm2 = _mm_xor_si128(xmm2, xmm2);
p_src0 += 2;
-
+
xmm3 = _mm_hsub_epi16(xmm0, xmm1);
-
- xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
xmm8 = _mm_and_si128(xmm2, xmm6);
xmm3 = _mm_and_si128(xmm2, xmm7);
-
+
xmm8 = _mm_add_epi8(xmm8, xmm4);
xmm3 = _mm_add_epi8(xmm3, xmm5);
xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
-
-
+
+
xmm3 = _mm_add_epi16(xmm0, xmm1);
-
+
_mm_store_si128(p_target, xmm3);
-
+
p_target += 1;
-
+
}
for(i = 0; i < intermediate; ++i) {
-
+
xmm0 = _mm_load_si128(p_src0);
-
-
+
+
xmm2 = _mm_xor_si128(xmm2, xmm2);
p_src0 += 1;
-
+
xmm3 = _mm_hsub_epi16(xmm0, xmm1);
xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
xmm8 = _mm_and_si128(xmm2, xmm6);
-
+
xmm3 = _mm_add_epi8(xmm8, xmm4);
-
+
xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
-
+
_mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
-
+
p_target = (__m128i*)((int8_t*)p_target + 8);
}
-
- for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
+
+ for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
}
-
-}
-
+
+}
+
#endif /*LV_HAVE_SSSE3*/
#ifdef LV_HAVE_GENERIC
static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
-
+
int i = 0;
-
+
int bound = num_bytes >> 1;
-
+
for(i = 0; i < bound; i += 2) {
target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
}
-
+
}
diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h
index de36cee80..47e3cbf9c 100644
--- a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h
+++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h
@@ -3,7 +3,7 @@
#include<inttypes.h>
-#include<stdio.h>
+#include<stdio.h>
@@ -14,33 +14,33 @@
#include<emmintrin.h>
static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
-
+
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
__m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
short* p_permute_indexes = permute_indexes;
-
+
p_target = (__m128i*)target;
p_cntl0 = (__m128i*)cntl0;
p_cntl1 = (__m128i*)cntl1;
p_cntl2 = (__m128i*)cntl2;
p_cntl3 = (__m128i*)cntl3;
p_scalars = (__m128i*)scalars;
-
+
int i = 0;
-
+
int bound = (num_bytes >> 4);
int leftovers = (num_bytes >> 1) & 7;
-
+
xmm0 = _mm_load_si128(p_scalars);
-
+
xmm1 = _mm_shufflelo_epi16(xmm0, 0);
xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
-
+
xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
@@ -64,49 +64,49 @@ static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short
xmm0 = _mm_add_epi16(xmm0, xmm5);
xmm6 = _mm_add_epi16(xmm6, xmm7);
-
+
p_permute_indexes += 8;
-
+
xmm0 = _mm_add_epi16(xmm0, xmm6);
-
+
xmm5 = _mm_load_si128(p_cntl0);
xmm6 = _mm_load_si128(p_cntl1);
xmm7 = _mm_load_si128(p_cntl2);
-
+
xmm5 = _mm_and_si128(xmm5, xmm1);
xmm6 = _mm_and_si128(xmm6, xmm2);
xmm7 = _mm_and_si128(xmm7, xmm3);
-
+
xmm0 = _mm_add_epi16(xmm0, xmm5);
-
+
xmm5 = _mm_load_si128(p_cntl3);
-
+
xmm6 = _mm_add_epi16(xmm6, xmm7);
p_cntl0 += 1;
-
+
xmm5 = _mm_and_si128(xmm5, xmm4);
-
+
xmm0 = _mm_add_epi16(xmm0, xmm6);
-
+
p_cntl1 += 1;
p_cntl2 += 1;
-
- xmm0 = _mm_add_epi16(xmm0, xmm5);
-
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+
p_cntl3 += 1;
_mm_store_si128(p_target, xmm0);
-
+
p_target += 1;
}
-
-
-
-
+
+
+
+
for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
- target[i] = src0[permute_indexes[i]]
+ target[i] = src0[permute_indexes[i]]
+ (cntl0[i] & scalars[0])
+ (cntl1[i] & scalars[1])
+ (cntl2[i] & scalars[2])
@@ -118,18 +118,18 @@ static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short
#ifdef LV_HAVE_GENERIC
static inline void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
-
+
int i = 0;
-
+
int bound = num_bytes >> 1;
for(i = 0; i < bound; ++i) {
- target[i] = src0[permute_indexes[i]]
+ target[i] = src0[permute_indexes[i]]
+ (cntl0[i] & scalars[0])
+ (cntl1[i] & scalars[1])
+ (cntl2[i] & scalars[2])
+ (cntl3[i] & scalars[3]);
-
+
}
}
diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a.h b/volk/include/volk/volk_16i_s32f_convert_32f_a.h
index 0555fdf00..7108ff659 100644
--- a/volk/include/volk/volk_16i_s32f_convert_32f_a.h
+++ b/volk/include/volk/volk_16i_s32f_convert_32f_a.h
@@ -17,7 +17,7 @@
static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;
-
+
float* outputVectorPtr = outputVector;
__m128 invScalar = _mm_set_ps1(1.0/scalar);
int16_t* inputPtr = (int16_t*)inputVector;
@@ -36,7 +36,7 @@ static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const
// Convert the lower 4 values into 32 bit words
inputVal = _mm_cvtepi16_epi32(inputVal);
inputVal2 = _mm_cvtepi16_epi32(inputVal2);
-
+
ret = _mm_cvtepi32_ps(inputVal);
ret = _mm_mul_ps(ret, invScalar);
_mm_storeu_ps(outputVectorPtr, ret);
@@ -71,7 +71,7 @@ static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const
static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
float* outputVectorPtr = outputVector;
__m128 invScalar = _mm_set_ps1(1.0/scalar);
int16_t* inputPtr = (int16_t*)inputVector;
@@ -79,7 +79,7 @@ static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const in
for(;number < quarterPoints; number++){
ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-
+
ret = _mm_mul_ps(ret, invScalar);
_mm_storeu_ps(outputVectorPtr, ret);
diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/include/volk/volk_16i_s32f_convert_32f_u.h
index d34acc091..4ce8e8f35 100644
--- a/volk/include/volk/volk_16i_s32f_convert_32f_u.h
+++ b/volk/include/volk/volk_16i_s32f_convert_32f_u.h
@@ -18,7 +18,7 @@
static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;
-
+
float* outputVectorPtr = outputVector;
__m128 invScalar = _mm_set_ps1(1.0/scalar);
int16_t* inputPtr = (int16_t*)inputVector;
@@ -37,7 +37,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const
// Convert the lower 4 values into 32 bit words
inputVal = _mm_cvtepi16_epi32(inputVal);
inputVal2 = _mm_cvtepi16_epi32(inputVal2);
-
+
ret = _mm_cvtepi32_ps(inputVal);
ret = _mm_mul_ps(ret, invScalar);
_mm_storeu_ps(outputVectorPtr, ret);
@@ -73,7 +73,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const
static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
float* outputVectorPtr = outputVector;
__m128 invScalar = _mm_set_ps1(1.0/scalar);
int16_t* inputPtr = (int16_t*)inputVector;
@@ -81,7 +81,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const in
for(;number < quarterPoints; number++){
ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-
+
ret = _mm_mul_ps(ret, invScalar);
_mm_storeu_ps(outputVectorPtr, ret);
diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h
index 2688aff04..0d8498553 100644
--- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h
+++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h
@@ -3,7 +3,7 @@
#include<inttypes.h>
-#include<stdio.h>
+#include<stdio.h>
@@ -14,7 +14,7 @@
#include<emmintrin.h>
static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
-
+
@@ -23,41 +23,41 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s
int bound = (num_bytes >> 4);
int bound_copy = bound;
int leftovers = (num_bytes >> 1) & 7;
-
+
__m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
p_target = (__m128i*) target;
p_src0 = (__m128i*)src0;
p_src1 = (__m128i*)src1;
p_src2 = (__m128i*)src2;
p_src3 = (__m128i*)src3;
-
-
+
+
__m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
while(bound_copy > 0) {
-
+
xmm1 = _mm_load_si128(p_src0);
xmm2 = _mm_load_si128(p_src1);
xmm3 = _mm_load_si128(p_src2);
xmm4 = _mm_load_si128(p_src3);
-
+
xmm5 = _mm_setzero_si128();
xmm6 = _mm_setzero_si128();
xmm7 = xmm1;
xmm8 = xmm3;
-
-
+
+
xmm1 = _mm_sub_epi16(xmm2, xmm1);
-
+
xmm3 = _mm_sub_epi16(xmm4, xmm3);
xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
-
+
xmm2 = _mm_and_si128(xmm5, xmm2);
xmm4 = _mm_and_si128(xmm6, xmm4);
@@ -67,7 +67,7 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s
xmm5 = _mm_add_epi16(xmm2, xmm5);
xmm6 = _mm_add_epi16(xmm4, xmm6);
-
+
xmm1 = _mm_xor_si128(xmm1, xmm1);
xmm2 = xmm5;
xmm5 = _mm_sub_epi16(xmm6, xmm5);
@@ -76,23 +76,23 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s
xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
p_src1 += 1;
-
+
xmm6 = _mm_and_si128(xmm1, xmm6);
-
+
xmm1 = _mm_andnot_si128(xmm1, xmm2);
p_src2 += 1;
-
+
xmm1 = _mm_add_epi16(xmm6, xmm1);
p_src3 += 1;
-
+
_mm_store_si128(p_target, xmm1);
p_target += 1;
-
+
}
-
+
/*asm volatile
(
@@ -111,25 +111,25 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s
"movaps %%xmm3, %%xmm8\n\t"
"psubw %%xmm2, %%xmm1\n\t"
"psubw %%xmm4, %%xmm3\n\t"
-
+
"pcmpgtw %%xmm1, %%xmm5\n\t"
"pcmpgtw %%xmm3, %%xmm6\n\t"
-
+
"pand %%xmm5, %%xmm2\n\t"
"pand %%xmm6, %%xmm4\n\t"
"pandn %%xmm7, %%xmm5\n\t"
"pandn %%xmm8, %%xmm6\n\t"
-
+
"paddw %%xmm2, %%xmm5\n\t"
"paddw %%xmm4, %%xmm6\n\t"
"pxor %%xmm1, %%xmm1\n\t"
"movaps %%xmm5, %%xmm2\n\t"
-
+
"psubw %%xmm6, %%xmm5\n\t"
"add $16, %[src0]\n\t"
"add $-1, %[bound]\n\t"
-
+
"pcmpgtw %%xmm5, %%xmm1\n\t"
"add $16, %[src1]\n\t"
@@ -144,13 +144,13 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s
"movaps %%xmm1, (%[target])\n\t"
"addw $16, %[target]\n\t"
"jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"
-
+
"volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
:
:[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
:
);
- */
+ */
short temp0 = 0;
short temp1 = 0;
@@ -169,11 +169,11 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s
#ifdef LV_HAVE_GENERIC
static inline void volk_16i_x4_quad_max_star_16i_a_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
-
+
int i = 0;
-
+
int bound = num_bytes >> 1;
-
+
short temp0 = 0;
short temp1 = 0;
for(i = 0; i < bound; ++i) {
diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h
index e4c9f17ed..5560b92d9 100644
--- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h
+++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h
@@ -3,7 +3,7 @@
#include<inttypes.h>
-#include<stdio.h>
+#include<stdio.h>
@@ -14,7 +14,7 @@
#include<emmintrin.h>
static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
-
+
__m128i xmm0, xmm1, xmm2, xmm3, xmm4;
__m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
p_target0 = (__m128i*)target0;
@@ -39,16 +39,16 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta
xmm2 = _mm_load_si128(p_src2);
xmm3 = _mm_load_si128(p_src3);
xmm4 = _mm_load_si128(p_src4);
-
+
p_src0 += 1;
p_src1 += 1;
-
+
xmm1 = _mm_add_epi16(xmm0, xmm1);
xmm2 = _mm_add_epi16(xmm0, xmm2);
xmm3 = _mm_add_epi16(xmm0, xmm3);
xmm4 = _mm_add_epi16(xmm0, xmm4);
-
-
+
+
p_src2 += 1;
p_src3 += 1;
p_src4 += 1;
@@ -57,7 +57,7 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta
_mm_store_si128(p_target1, xmm2);
_mm_store_si128(p_target2, xmm3);
_mm_store_si128(p_target3, xmm4);
-
+
p_target0 += 1;
p_target1 += 1;
p_target2 += 1;
@@ -97,9 +97,9 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta
:[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
:"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
-
+
*/
-
+
for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
target0[i] = src0[i] + src1[i];
@@ -114,9 +114,9 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta
#ifdef LV_HAVE_GENERIC
static inline void volk_16i_x5_add_quad_16i_x4_a_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
-
+
int i = 0;
-
+
int bound = num_bytes >> 1;
for(i = 0; i < bound; ++i) {
diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h
index cdd60235e..f8aa30874 100644
--- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h
+++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h
@@ -71,7 +71,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_
__m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
unsigned int eighthPoints = num_points / 8;
-
+
for(number = 0; number < eighthPoints; number++){
complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h
index 2b99e068e..bac1f2e4b 100644
--- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h
+++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h
@@ -64,7 +64,7 @@ static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, cons
__m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
unsigned int eighthPoints = num_points / 8;
-
+
for(number = 0; number < eighthPoints; number++){
complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a.h b/volk/include/volk/volk_16ic_magnitude_16i_a.h
index a6951e967..317075e85 100644
--- a/volk/include/volk/volk_16ic_magnitude_16i_a.h
+++ b/volk/include/volk/volk_16ic_magnitude_16i_a.h
@@ -17,7 +17,7 @@
static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const int16_t* complexVectorPtr = (const int16_t*)complexVector;
int16_t* magnitudeVectorPtr = magnitudeVector;
@@ -35,7 +35,7 @@ static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, cons
inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-
+
inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
@@ -106,7 +106,7 @@ static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const
inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-
+
cplxValue1 = _mm_load_ps(inputFloatBuffer);
complexVectorPtr += 4;
diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h
index e73d405e0..1300395ff 100644
--- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h
+++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h
@@ -20,7 +20,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, floa
float* qBufferPtr = qBuffer;
uint64_t number = 0;
- const uint64_t quarterPoints = num_points / 4;
+ const uint64_t quarterPoints = num_points / 4;
__m128 cplxValue1, cplxValue2, iValue, qValue;
__m128 invScalar = _mm_set_ps1(1.0/scalar);
@@ -29,12 +29,12 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, floa
__VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
for(;number < quarterPoints; number++){
-
+
floatBuffer[0] = (float)(complexVectorPtr[0]);
floatBuffer[1] = (float)(complexVectorPtr[1]);
floatBuffer[2] = (float)(complexVectorPtr[2]);
floatBuffer[3] = (float)(complexVectorPtr[3]);
-
+
floatBuffer[4] = (float)(complexVectorPtr[4]);
floatBuffer[5] = (float)(complexVectorPtr[5]);
floatBuffer[6] = (float)(complexVectorPtr[6]);
diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h
index 1630cb0ed..5e2d82b94 100644
--- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h
+++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h
@@ -18,7 +18,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
float* iBufferPtr = iBuffer;
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
__m128 iFloatValue;
@@ -49,7 +49,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
*iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
sixteenTComplexVectorPtr++;
}
-
+
}
#endif /* LV_HAVE_SSE4_1 */
@@ -66,7 +66,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co
float* iBufferPtr = iBuffer;
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
__m128 iValue;
const float iScalar = 1.0/scalar;
@@ -77,7 +77,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co
for(;number < quarterPoints; number++){
floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
- floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
@@ -96,7 +96,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co
*iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
complexVectorPtr++;
}
-
+
}
#endif /* LV_HAVE_SSE */
diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h
index 35406e2cb..d20eea1a7 100644
--- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h
+++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h
@@ -18,7 +18,7 @@
static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const int16_t* complexVectorPtr = (const int16_t*)complexVector;
float* magnitudeVectorPtr = magnitudeVector;
@@ -34,7 +34,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, c
inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-
+
inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
@@ -56,7 +56,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, c
result = _mm_sqrt_ps(result); // Square root the values
_mm_store_ps(magnitudeVectorPtr, result);
-
+
magnitudeVectorPtr += 4;
}
@@ -99,7 +99,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co
inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-
+
inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
@@ -107,7 +107,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co
cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-
+
re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
@@ -124,7 +124,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co
result = _mm_sqrt_ps(result); // Square root the values
_mm_store_ps(magnitudeVectorPtr, result);
-
+
magnitudeVectorPtr += 4;
}
@@ -138,7 +138,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co
}
}
-
+
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
diff --git a/volk/include/volk/volk_16u_byteswap_a.h b/volk/include/volk/volk_16u_byteswap_a.h
index 75c7ef0f3..fc3eb5fa7 100644
--- a/volk/include/volk/volk_16u_byteswap_a.h
+++ b/volk/include/volk/volk_16u_byteswap_a.h
@@ -31,9 +31,9 @@ static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int n
inputPtr += 8;
}
-
+
// Byteswap any remaining points:
- number = eighthPoints*8;
+ number = eighthPoints*8;
for(; number < num_points; number++){
uint16_t outputVal = *inputPtr;
outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a.h b/volk/include/volk/volk_32f_accumulator_s32f_a.h
index 7ce0d1c80..78364d0a0 100644
--- a/volk/include/volk/volk_32f_accumulator_s32f_a.h
+++ b/volk/include/volk/volk_32f_accumulator_s32f_a.h
@@ -20,13 +20,13 @@ static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* i
const float* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
-
+
__m128 accumulator = _mm_setzero_ps();
__m128 aVal = _mm_setzero_ps();
for(;number < quarterPoints; number++){
aVal = _mm_load_ps(aPtr);
- accumulator = _mm_add_ps(accumulator, aVal);
+ accumulator = _mm_add_ps(accumulator, aVal);
aPtr += 4;
}
_mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
@@ -34,7 +34,7 @@ static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* i
returnValue += tempBuffer[1];
returnValue += tempBuffer[2];
returnValue += tempBuffer[3];
-
+
number = quarterPoints * 4;
for(;number < num_points; number++){
returnValue += (*aPtr++);
diff --git a/volk/include/volk/volk_32f_convert_64f_a.h b/volk/include/volk/volk_32f_convert_64f_a.h
index dda646409..2c469ac42 100644
--- a/volk/include/volk/volk_32f_convert_64f_a.h
+++ b/volk/include/volk/volk_32f_convert_64f_a.h
@@ -16,7 +16,7 @@ static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* inputVectorPtr = (const float*)inputVector;
double* outputVectorPtr = outputVector;
__m128d ret;
@@ -24,7 +24,7 @@ static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float
for(;number < quarterPoints; number++){
inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
+
ret = _mm_cvtps_pd(inputVal);
_mm_store_pd(outputVectorPtr, ret);
@@ -38,7 +38,7 @@ static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float
outputVectorPtr += 2;
}
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
for(; number < num_points; number++){
outputVector[number] = (double)(inputVector[number]);
}
diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h
index 387baa3b9..10d8a4f6c 100644
--- a/volk/include/volk/volk_32f_convert_64f_u.h
+++ b/volk/include/volk/volk_32f_convert_64f_u.h
@@ -16,7 +16,7 @@ static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* inputVectorPtr = (const float*)inputVector;
double* outputVectorPtr = outputVector;
__m128d ret;
@@ -24,7 +24,7 @@ static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float
for(;number < quarterPoints; number++){
inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
+
ret = _mm_cvtps_pd(inputVal);
_mm_storeu_pd(outputVectorPtr, ret);
@@ -38,7 +38,7 @@ static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float
outputVectorPtr += 2;
}
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
for(; number < num_points; number++){
outputVector[number] = (double)(inputVector[number]);
}
diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/include/volk/volk_32f_index_max_16u_a.h
index 0c43a5081..b9ca1dd3e 100644
--- a/volk/include/volk/volk_32f_index_max_16u_a.h
+++ b/volk/include/volk/volk_32f_index_max_16u_a.h
@@ -52,7 +52,7 @@ static inline void volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const f
}
number = quarterPoints * 4;
- for(;number < num_points; number++){
+ for(;number < num_points; number++){
if(src0[number] > max){
index = number;
max = src0[number];
@@ -111,7 +111,7 @@ static inline void volk_32f_index_max_16u_a_sse(unsigned int* target, const floa
}
number = quarterPoints * 4;
- for(;number < num_points; number++){
+ for(;number < num_points; number++){
if(src0[number] > max){
index = number;
max = src0[number];
@@ -128,11 +128,11 @@ static inline void volk_32f_index_max_16u_a_generic(unsigned int* target, const
if(num_points > 0){
float max = src0[0];
unsigned int index = 0;
-
- unsigned int i = 1;
-
+
+ unsigned int i = 1;
+
for(; i < num_points; ++i) {
-
+
if(src0[i] > max){
index = i;
max = src0[i];
diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h
index b25df75a1..43713f8b5 100644
--- a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h
+++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h
@@ -46,7 +46,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co
inPtr++;
outPtr++;
}
-
+
for (; number < quarterPoints; number++) {
// Load data
next3old1 = _mm_loadu_ps((float*) (inPtr-1));
@@ -65,7 +65,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co
_mm_store_ps(outPtr,next3old1); // Store the results back into the output
outPtr += 4;
}
-
+
for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) {
*outPtr = *(inPtr) - *(inPtr-1);
if (*outPtr > bound) *outPtr -= 2*bound;
@@ -73,7 +73,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co
inPtr++;
outPtr++;
}
-
+
*saveValue = inputVector[num_points-1];
}
#endif /* LV_HAVE_SSE */
@@ -94,14 +94,14 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector
unsigned int number = 0;
float* outPtr = outputVector;
const float* inPtr = inputVector;
-
+
// Do the first 1 by hand since we're going in from the saveValue:
*outPtr = *inPtr - *saveValue;
if (*outPtr > bound) *outPtr -= 2*bound;
if (*outPtr < -bound) *outPtr += 2*bound;
inPtr++;
outPtr++;
-
+
for (number = 1; number < num_points; number++) {
*outPtr = *(inPtr) - *(inPtr-1);
if (*outPtr > bound) *outPtr -= 2*bound;
@@ -109,7 +109,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector
inPtr++;
outPtr++;
}
-
+
*saveValue = inputVector[num_points-1];
}
#endif /* LV_HAVE_GENERIC */
diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h
index b1902a8c0..db61e359d 100644
--- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h
+++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h
@@ -23,7 +23,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois
const float* dataPointsPtr = realDataPoints;
__VOLK_ATTR_ALIGNED(16) float avgPointsVector[4];
-
+
__m128 dataPointsVal;
__m128 avgPointsVal = _mm_setzero_ps();
// Calculate the sum (for mean) for all points
@@ -73,11 +73,11 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois
// Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
-
+
// Count the number of bins which do not exceed the mean amplitude
vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
}
-
+
// Calculate the mean from the remaining data points
_mm_store_ps(avgPointsVector, avgPointsVal);
@@ -104,7 +104,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois
validBinCount += 1.0;
}
}
-
+
float localNoiseFloorAmplitude = 0;
if(validBinCount > 0.0){
localNoiseFloorAmplitude = sumMean / validBinCount;
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
index a24959678..9df4946f2 100644
--- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
@@ -19,7 +19,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;
-
+
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
@@ -51,7 +51,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const
outputVectorPtr += 8;
}
- number = eighthPoints * 8;
+ number = eighthPoints * 8;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
@@ -76,7 +76,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
@@ -105,7 +105,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
}
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
index f58158041..56e42c9bd 100644
--- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
@@ -19,7 +19,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;
-
+
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
@@ -51,7 +51,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const
outputVectorPtr += 8;
}
- number = eighthPoints * 8;
+ number = eighthPoints * 8;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
@@ -77,7 +77,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
@@ -106,7 +106,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
}
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h
index 15fa282fb..38e6b2e74 100644
--- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h
+++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h
@@ -18,7 +18,7 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;
-
+
const float* inputVectorPtr = (const float*)inputVector;
int32_t* outputVectorPtr = outputVector;
@@ -42,7 +42,7 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const
outputVectorPtr += 8;
}
- number = eighthPoints * 8;
+ number = eighthPoints * 8;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
@@ -67,7 +67,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* inputVectorPtr = (const float*)inputVector;
int32_t* outputVectorPtr = outputVector;
@@ -91,7 +91,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const
outputVectorPtr += 4;
}
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
@@ -116,7 +116,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* inputVectorPtr = (const float*)inputVector;
int32_t* outputVectorPtr = outputVector;
@@ -144,7 +144,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const
*outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
}
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h
index d203546c6..ee15edb46 100644
--- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h
+++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h
@@ -18,7 +18,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* inputVectorPtr = (const float*)inputVector;
int32_t* outputVectorPtr = outputVector;
@@ -42,7 +42,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const
outputVectorPtr += 4;
}
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
@@ -68,7 +68,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* inputVectorPtr = (const float*)inputVector;
int32_t* outputVectorPtr = outputVector;
@@ -96,7 +96,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const
*outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
}
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h
index 05172171c..800017d5d 100644
--- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h
+++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h
@@ -18,7 +18,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f
unsigned int number = 0;
const unsigned int sixteenthPoints = num_points / 16;
-
+
const float* inputVectorPtr = (const float*)inputVector;
int8_t* outputVectorPtr = outputVector;
@@ -47,7 +47,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f
intInputVal2 = _mm_cvtps_epi32(inputVal2);
intInputVal3 = _mm_cvtps_epi32(inputVal3);
intInputVal4 = _mm_cvtps_epi32(inputVal4);
-
+
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
@@ -57,7 +57,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f
outputVectorPtr += 16;
}
- number = sixteenthPoints * 16;
+ number = sixteenthPoints * 16;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
@@ -82,7 +82,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* inputVectorPtr = (const float*)inputVector;
float min_val = -128;
@@ -110,7 +110,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl
*outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
}
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h
index 12991e9c1..870e9419b 100644
--- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h
+++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h
@@ -18,7 +18,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f
unsigned int number = 0;
const unsigned int sixteenthPoints = num_points / 16;
-
+
const float* inputVectorPtr = (const float*)inputVector;
int8_t* outputVectorPtr = outputVector;
@@ -47,7 +47,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f
intInputVal2 = _mm_cvtps_epi32(inputVal2);
intInputVal3 = _mm_cvtps_epi32(inputVal3);
intInputVal4 = _mm_cvtps_epi32(inputVal4);
-
+
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
@@ -57,7 +57,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f
outputVectorPtr += 16;
}
- number = sixteenthPoints * 16;
+ number = sixteenthPoints * 16;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
@@ -83,7 +83,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* inputVectorPtr = (const float*)inputVector;
int8_t* outputVectorPtr = outputVector;
@@ -111,7 +111,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl
*outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
}
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h
index d1c6f3f65..99b8e68c5 100644
--- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h
+++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h
@@ -23,11 +23,11 @@ static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float*
__m128 aVal, bVal, cVal;
bVal = _mm_set_ps1(scalar);
for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
+
+ aVal = _mm_load_ps(aPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
_mm_store_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
@@ -60,11 +60,11 @@ static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float*
__m256 aVal, bVal, cVal;
bVal = _mm256_set1_ps(scalar);
for(;number < eighthPoints; number++){
-
- aVal = _mm256_load_ps(aPtr);
-
- cVal = _mm256_mul_ps(aVal, bVal);
-
+
+ aVal = _mm256_load_ps(aPtr);
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
_mm256_store_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 8;
diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
index 0e700060f..b3fae9b05 100644
--- a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
+++ b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
@@ -23,11 +23,11 @@ static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float*
__m128 aVal, bVal, cVal;
bVal = _mm_set_ps1(scalar);
for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
+
+ aVal = _mm_loadu_ps(aPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
_mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
@@ -60,11 +60,11 @@ static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float*
__m256 aVal, bVal, cVal;
bVal = _mm256_set1_ps(scalar);
for(;number < eighthPoints; number++){
-
- aVal = _mm256_loadu_ps(aPtr);
-
- cVal = _mm256_mul_ps(aVal, bVal);
-
+
+ aVal = _mm256_loadu_ps(aPtr);
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
_mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 8;
diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/include/volk/volk_32f_s32f_power_32f_a.h
index 09c905961..633ad14b0 100644
--- a/volk/include/volk/volk_32f_s32f_power_32f_a.h
+++ b/volk/include/volk/volk_32f_s32f_power_32f_a.h
@@ -21,7 +21,7 @@
*/
static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){
unsigned int number = 0;
-
+
float* cPtr = cVector;
const float* aPtr = aVector;
@@ -33,22 +33,22 @@ static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float*
__m128 negatedValues;
__m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
__m128 onesMask = _mm_set_ps1(1);
-
+
__m128 aVal, cVal;
for(;number < quarterPoints; number++){
-
+
aVal = _mm_load_ps(aPtr);
signMask = _mm_cmplt_ps(aVal, zeroValue);
negatedValues = _mm_sub_ps(zeroValue, aVal);
aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
-
+
// powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
cVal = powf4(aVal, vPower); // Takes each input value to the specified power
cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
_mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
+
aPtr += 4;
cPtr += 4;
}
@@ -78,7 +78,7 @@ static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float*
*/
static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){
unsigned int number = 0;
-
+
float* cPtr = cVector;
const float* aPtr = aVector;
@@ -90,22 +90,22 @@ static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aV
__m128 negatedValues;
__m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
__m128 onesMask = _mm_set_ps1(1);
-
+
__m128 aVal, cVal;
for(;number < quarterPoints; number++){
-
+
aVal = _mm_load_ps(aPtr);
signMask = _mm_cmplt_ps(aVal, zeroValue);
negatedValues = _mm_sub_ps(zeroValue, aVal);
aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) );
-
+
// powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
cVal = powf4(aVal, vPower); // Takes each input value to the specified power
cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal);
_mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
+
aPtr += 4;
cPtr += 4;
}
diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h
index 75fe0cb2e..98401b2d4 100644
--- a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h
+++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h
@@ -29,7 +29,7 @@ static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float*
__m128 aVal1, aVal2, aVal3, aVal4;
__m128 cVal1, cVal2, cVal3, cVal4;
for(;number < sixteenthPoints; number++) {
- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal1 = _mm_load_ps(aPtr); aPtr += 4;
cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
aVal2 = _mm_load_ps(aPtr); aPtr += 4;
@@ -47,12 +47,12 @@ static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float*
squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
}
- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
returnValue = squareBuffer[0];
returnValue += squareBuffer[1];
returnValue += squareBuffer[2];
returnValue += squareBuffer[3];
-
+
number = sixteenthPoints * 16;
for(;number < num_points; number++){
returnValue += (*aPtr) * (*aPtr);
@@ -93,12 +93,12 @@ static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* in
squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
aPtr += 4;
}
- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
returnValue = squareBuffer[0];
returnValue += squareBuffer[1];
returnValue += squareBuffer[2];
returnValue += squareBuffer[3];
-
+
number = quarterPoints * 4;
for(;number < num_points; number++){
returnValue += (*aPtr) * (*aPtr);
@@ -125,7 +125,7 @@ static inline void volk_32f_s32f_stddev_32f_a_generic(float* stddev, const float
if(num_points > 0){
const float* aPtr = inputBuffer;
unsigned int number = 0;
-
+
for(number = 0; number < num_points; number++){
returnValue += (*aPtr) * (*aPtr);
aPtr++;
diff --git a/volk/include/volk/volk_32f_sqrt_32f_a.h b/volk/include/volk/volk_32f_sqrt_32f_a.h
index e44c73cfd..d9b16fc0f 100644
--- a/volk/include/volk/volk_32f_sqrt_32f_a.h
+++ b/volk/include/volk/volk_32f_sqrt_32f_a.h
@@ -22,11 +22,11 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector,
__m128 aVal, cVal;
for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
-
- cVal = _mm_sqrt_ps(aVal);
-
+
+ aVal = _mm_load_ps(aPtr);
+
+ cVal = _mm_sqrt_ps(aVal);
+
_mm_store_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h
index 20ff676d8..7de32f7b1 100644
--- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h
+++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h
@@ -31,7 +31,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float
__m128 aVal1, aVal2, aVal3, aVal4;
__m128 cVal1, cVal2, cVal3, cVal4;
for(;number < sixteenthPoints; number++) {
- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal1 = _mm_load_ps(aPtr); aPtr += 4;
cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x
@@ -54,7 +54,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float
squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
}
_mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
newMean = meanBuffer[0];
newMean += meanBuffer[1];
newMean += meanBuffer[2];
@@ -63,7 +63,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float
returnValue += squareBuffer[1];
returnValue += squareBuffer[2];
returnValue += squareBuffer[3];
-
+
number = sixteenthPoints * 16;
for(;number < num_points; number++){
returnValue += (*aPtr) * (*aPtr);
@@ -110,7 +110,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* m
aPtr += 4;
}
_mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
newMean = meanBuffer[0];
newMean += meanBuffer[1];
newMean += meanBuffer[2];
@@ -119,7 +119,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* m
returnValue += squareBuffer[1];
returnValue += squareBuffer[2];
returnValue += squareBuffer[3];
-
+
number = quarterPoints * 4;
for(;number < num_points; number++){
returnValue += (*aPtr) * (*aPtr);
@@ -149,7 +149,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_generic(float* stddev, floa
if(num_points > 0){
const float* aPtr = inputBuffer;
unsigned int number = 0;
-
+
for(number = 0; number < num_points; number++){
returnValue += (*aPtr) * (*aPtr);
newMean += *aPtr++;
diff --git a/volk/include/volk/volk_32f_x2_add_32f_a.h b/volk/include/volk/volk_32f_x2_add_32f_a.h
index 3bc83653b..51e63e54d 100644
--- a/volk/include/volk/volk_32f_x2_add_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_add_32f_a.h
@@ -23,12 +23,12 @@ static inline void volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVecto
__m128 aVal, bVal, cVal;
for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
+
+ aVal = _mm_load_ps(aPtr);
bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_add_ps(aVal, bVal);
-
+
+ cVal = _mm_add_ps(aVal, bVal);
+
_mm_store_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h
index e360a7958..52e8286bc 100644
--- a/volk/include/volk/volk_32f_x2_add_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_add_32f_u.h
@@ -23,12 +23,12 @@ static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVecto
__m128 aVal, bVal, cVal;
for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
+
+ aVal = _mm_loadu_ps(aPtr);
bVal = _mm_loadu_ps(bPtr);
-
- cVal = _mm_add_ps(aVal, bVal);
-
+
+ cVal = _mm_add_ps(aVal, bVal);
+
_mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a.h b/volk/include/volk/volk_32f_x2_divide_32f_a.h
index 52ddfae87..7b60fb22e 100644
--- a/volk/include/volk/volk_32f_x2_divide_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_divide_32f_a.h
@@ -23,12 +23,12 @@ static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVe
__m128 aVal, bVal, cVal;
for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
+
+ aVal = _mm_load_ps(aPtr);
bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_div_ps(aVal, bVal);
-
+
+ cVal = _mm_div_ps(aVal, bVal);
+
_mm_store_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
index 0c58f2ecf..448b2fdc0 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
@@ -18,7 +18,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa
for(number = 0; number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
}
-
+
*result = dotProduct;
}
@@ -29,7 +29,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa
static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
-
+
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
@@ -42,11 +42,11 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
__m128 dotProdVal = _mm_setzero_ps();
for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
+
+ aVal = _mm_load_ps(aPtr);
bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
+
+ cVal = _mm_mul_ps(aVal, bVal);
dotProdVal = _mm_add_ps(cVal, dotProdVal);
@@ -69,10 +69,10 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
}
*result = dotProduct;
-
+
}
-#endif /*LV_HAVE_SSE*/
+#endif /*LV_HAVE_SSE*/
#ifdef LV_HAVE_SSE3
@@ -91,11 +91,11 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float *
__m128 dotProdVal = _mm_setzero_ps();
for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
+
+ aVal = _mm_load_ps(aPtr);
bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
+
+ cVal = _mm_mul_ps(aVal, bVal);
dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
@@ -117,7 +117,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float *
}
*result = dotProduct;
-}
+}
#endif /*LV_HAVE_SSE3*/
@@ -140,7 +140,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float
__m128 dotProdVal = _mm_setzero_ps();
- for(;number < sixteenthPoints; number++){
+ for(;number < sixteenthPoints; number++){
aVal1 = _mm_load_ps(aPtr); aPtr += 4;
aVal2 = _mm_load_ps(aPtr); aPtr += 4;
@@ -151,7 +151,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float
bVal2 = _mm_load_ps(bPtr); bPtr += 4;
bVal3 = _mm_load_ps(bPtr); bPtr += 4;
bVal4 = _mm_load_ps(bPtr); bPtr += 4;
-
+
cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
@@ -178,7 +178,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float
}
*result = dotProduct;
-}
+}
#endif /*LV_HAVE_SSE4_1*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
index 7f47122ff..3b7284b57 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -17,7 +17,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa
for(number = 0; number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
}
-
+
*result = dotProduct;
}
@@ -28,7 +28,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa
static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
-
+
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
@@ -41,11 +41,11 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
__m128 dotProdVal = _mm_setzero_ps();
for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
+
+ aVal = _mm_loadu_ps(aPtr);
bVal = _mm_loadu_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
+
+ cVal = _mm_mul_ps(aVal, bVal);
dotProdVal = _mm_add_ps(cVal, dotProdVal);
@@ -68,10 +68,10 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
}
*result = dotProduct;
-
+
}
-#endif /*LV_HAVE_SSE*/
+#endif /*LV_HAVE_SSE*/
#ifdef LV_HAVE_SSE3
@@ -90,11 +90,11 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
__m128 dotProdVal = _mm_setzero_ps();
for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
+
+ aVal = _mm_loadu_ps(aPtr);
bVal = _mm_loadu_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
+
+ cVal = _mm_mul_ps(aVal, bVal);
dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
@@ -116,7 +116,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
}
*result = dotProduct;
-}
+}
#endif /*LV_HAVE_SSE3*/
@@ -140,7 +140,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
__m128 dotProdVal = _mm_setzero_ps();
for(;number < sixteenthPoints; number++){
-
+
aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
@@ -150,7 +150,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
-
+
cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
@@ -177,7 +177,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
}
*result = dotProduct;
-}
+}
#endif /*LV_HAVE_SSE4_1*/
diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h
index 1d4d2dbbd..52d80b6bb 100644
--- a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h
+++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h
@@ -20,7 +20,7 @@ static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, c
const float* qBufferPtr = qBuffer;
const uint64_t quarterPoints = num_points / 4;
-
+
__m128 iValue, qValue, cplxValue;
for(;number < quarterPoints; number++){
iValue = _mm_load_ps(iBufferPtr);
diff --git a/volk/include/volk/volk_32f_x2_max_32f_a.h b/volk/include/volk/volk_32f_x2_max_32f_a.h
index 7948c458d..79f2d04b5 100644
--- a/volk/include/volk/volk_32f_x2_max_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_max_32f_a.h
@@ -23,12 +23,12 @@ static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVecto
__m128 aVal, bVal, cVal;
for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
+
+ aVal = _mm_load_ps(aPtr);
bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_max_ps(aVal, bVal);
-
+
+ cVal = _mm_max_ps(aVal, bVal);
+
_mm_store_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
diff --git a/volk/include/volk/volk_32f_x2_min_32f_a.h b/volk/include/volk/volk_32f_x2_min_32f_a.h
index d77134868..42cac0833 100644
--- a/volk/include/volk/volk_32f_x2_min_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_min_32f_a.h
@@ -23,12 +23,12 @@ static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVecto
__m128 aVal, bVal, cVal;
for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
+
+ aVal = _mm_load_ps(aPtr);
bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_min_ps(aVal, bVal);
-
+
+ cVal = _mm_min_ps(aVal, bVal);
+
_mm_store_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a.h b/volk/include/volk/volk_32f_x2_multiply_32f_a.h
index fae9a652f..340e05165 100644
--- a/volk/include/volk/volk_32f_x2_multiply_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_multiply_32f_a.h
@@ -23,12 +23,12 @@ static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* a
__m128 aVal, bVal, cVal;
for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
+
+ aVal = _mm_load_ps(aPtr);
bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
_mm_store_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
@@ -62,12 +62,12 @@ static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* a
__m256 aVal, bVal, cVal;
for(;number < eighthPoints; number++){
-
- aVal = _mm256_load_ps(aPtr);
+
+ aVal = _mm256_load_ps(aPtr);
bVal = _mm256_load_ps(bPtr);
-
- cVal = _mm256_mul_ps(aVal, bVal);
-
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
_mm256_store_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 8;
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h
index 6c3ce5d83..bfb896d60 100644
--- a/volk/include/volk/volk_32f_x2_multiply_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_multiply_32f_u.h
@@ -23,12 +23,12 @@ static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* a
__m128 aVal, bVal, cVal;
for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
+
+ aVal = _mm_loadu_ps(aPtr);
bVal = _mm_loadu_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
_mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
@@ -62,12 +62,12 @@ static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* a
__m256 aVal, bVal, cVal;
for(;number < eighthPoints; number++){
-
- aVal = _mm256_loadu_ps(aPtr);
+
+ aVal = _mm256_loadu_ps(aPtr);
bVal = _mm256_loadu_ps(bPtr);
-
- cVal = _mm256_mul_ps(aVal, bVal);
-
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
_mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 8;
diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h
index cc02c3678..10fc267dc 100644
--- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h
+++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h
@@ -23,7 +23,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVec
__m128 vScalar = _mm_set_ps1(scalar);
const unsigned int quarterPoints = num_points / 4;
-
+
__m128 iValue, qValue, cplxValue1, cplxValue2;
__m128i intValue1, intValue2;
@@ -59,7 +59,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVec
*complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
*complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
}
-
+
}
#endif /* LV_HAVE_SSE2 */
@@ -81,7 +81,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVect
__m128 vScalar = _mm_set_ps1(scalar);
const unsigned int quarterPoints = num_points / 4;
-
+
__m128 iValue, qValue, cplxValue;
int16_t* complexVectorPtr = (int16_t*)complexVector;
@@ -106,9 +106,9 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVect
// Interleaves the upper two values in the i and q variables into one buffer
cplxValue = _mm_unpackhi_ps(iValue, qValue);
cplxValue = _mm_mul_ps(cplxValue, vScalar);
-
+
_mm_store_ps(floatBuffer, cplxValue);
-
+
*complexVectorPtr++ = (int16_t)(floatBuffer[0]);
*complexVectorPtr++ = (int16_t)(floatBuffer[1]);
*complexVectorPtr++ = (int16_t)(floatBuffer[2]);
@@ -124,7 +124,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVect
*complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
*complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
}
-
+
}
#endif /* LV_HAVE_SSE */
diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a.h b/volk/include/volk/volk_32f_x2_subtract_32f_a.h
index 16cad008a..e2b8be797 100644
--- a/volk/include/volk/volk_32f_x2_subtract_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_subtract_32f_a.h
@@ -23,12 +23,12 @@ static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* a
__m128 aVal, bVal, cVal;
for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
+
+ aVal = _mm_load_ps(aPtr);
bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_sub_ps(aVal, bVal);
-
+
+ cVal = _mm_sub_ps(aVal, bVal);
+
_mm_store_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h
index 153bb3a25..3c530628c 100644
--- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h
+++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h
@@ -14,33 +14,33 @@
#include<pmmintrin.h>
static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
-
-
+
+
float result = 0.0;
float fst = 0.0;
float sq = 0.0;
float thrd = 0.0;
float frth = 0.0;
//float fith = 0.0;
-
-
-
+
+
+
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
xmm9 = _mm_setzero_ps();
xmm1 = _mm_setzero_ps();
-
+
xmm0 = _mm_load1_ps(&center_point_array[0]);
xmm6 = _mm_load1_ps(&center_point_array[1]);
xmm7 = _mm_load1_ps(&center_point_array[2]);
xmm8 = _mm_load1_ps(&center_point_array[3]);
//xmm11 = _mm_load1_ps(&center_point_array[4]);
xmm10 = _mm_load1_ps(cutoff);
-
+
int bound = num_bytes >> 4;
int leftovers = (num_bytes >> 2) & 3;
int i = 0;
-
+
for(; i < bound; ++i) {
xmm2 = _mm_load_ps(src0);
xmm2 = _mm_max_ps(xmm10, xmm2);
@@ -57,23 +57,23 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0
xmm2 = _mm_add_ps(xmm2, xmm3);
xmm3 = _mm_add_ps(xmm4, xmm5);
-
+
src0 += 4;
-
+
xmm9 = _mm_add_ps(xmm2, xmm9);
-
+
xmm1 = _mm_add_ps(xmm3, xmm1);
//xmm9 = _mm_add_ps(xmm12, xmm9);
}
-
+
xmm2 = _mm_hadd_ps(xmm9, xmm1);
xmm3 = _mm_hadd_ps(xmm2, xmm2);
xmm4 = _mm_hadd_ps(xmm3, xmm3);
_mm_store_ss(&result, xmm4);
-
-
+
+
for(i = 0; i < leftovers; ++i) {
fst = src0[i];
@@ -82,11 +82,11 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0
thrd = fst * sq;
frth = sq * sq;
//fith = sq * thrd;
-
- result += (center_point_array[0] * fst +
- center_point_array[1] * sq +
- center_point_array[2] * thrd +
- center_point_array[3] * frth);// +
+
+ result += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth);// +
//center_point_array[4] * fith);
}
@@ -94,7 +94,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0
target[0] = result;
}
-
+
#endif /*LV_HAVE_SSE3*/
@@ -103,45 +103,45 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0
static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
-
+
float result = 0.0;
float fst = 0.0;
float sq = 0.0;
float thrd = 0.0;
float frth = 0.0;
//float fith = 0.0;
-
- unsigned int i = 0;
-
+
+ unsigned int i = 0;
+
for(; i < num_bytes >> 2; ++i) {
fst = src0[i];
fst = MAX(fst, *cutoff);
-
+
sq = fst * fst;
thrd = fst * sq;
frth = sq * sq;
//fith = sq * thrd;
-
- result += (center_point_array[0] * fst +
- center_point_array[1] * sq +
- center_point_array[2] * thrd +
+
+ result += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
center_point_array[3] * frth); //+
//center_point_array[4] * fith);
- /*printf("%f12...%d\n", (center_point_array[0] * fst +
- center_point_array[1] * sq +
- center_point_array[2] * thrd +
+ /*printf("%f12...%d\n", (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
center_point_array[3] * frth) +
- //center_point_array[4] * fith) +
+ //center_point_array[4] * fith) +
(center_point_array[4]), i);
*/
}
result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
-
-
+
+
*target = result;
}
diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h
index b7350b9fa..28d584bf2 100644
--- a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h
+++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h
@@ -23,11 +23,11 @@ static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const l
__m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
for(;number < quarterPoints; number++){
-
+
aVal1 = _mm_load_ps((const float*)aPtr);
aPtr += 2;
-
- aVal2 = _mm_load_ps((const float*)aPtr);
+
+ aVal2 = _mm_load_ps((const float*)aPtr);
aPtr += 2;
bVal = _mm_load_ps(bPtr);
@@ -36,13 +36,13 @@ static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const l
bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0));
bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2));
- cVal = _mm_mul_ps(aVal1, bVal1);
-
+ cVal = _mm_mul_ps(aVal1, bVal1);
+
_mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
cPtr += 2;
- cVal = _mm_mul_ps(aVal2, bVal2);
-
+ cVal = _mm_mul_ps(aVal2, bVal2);
+
_mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
cPtr += 2;
@@ -69,7 +69,7 @@ static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, con
const lv_32fc_t* aPtr = aVector;
const float* bPtr= bVector;
unsigned int number = 0;
-
+
for(number = 0; number < num_points; number++){
*cPtr++ = (*aPtr++) * (*bPtr++);
}
diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_conjugate_32fc_a.h
index 1518af9be..919280d51 100644
--- a/volk/include/volk/volk_32fc_conjugate_32fc_a.h
+++ b/volk/include/volk/volk_32fc_conjugate_32fc_a.h
@@ -25,11 +25,11 @@ static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_
__m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
for(;number < halfPoints; number++){
-
+
x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
x = _mm_xor_ps(x, conjugator); // conjugate register
-
+
_mm_store_ps((float*)c,x); // Store the results back into the C container
a += 2;
diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h
index b26fe0789..e0d79ea7b 100644
--- a/volk/include/volk/volk_32fc_conjugate_32fc_u.h
+++ b/volk/include/volk/volk_32fc_conjugate_32fc_u.h
@@ -21,13 +21,13 @@ static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_
__m128 x;
lv_32fc_t* c = cVector;
const lv_32fc_t* a = aVector;
-
+
__m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
for(;number < halfPoints; number++){
-
+
x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
-
+
x = _mm_xor_ps(x, conjugator); // conjugate register
_mm_storeu_ps((float*)c,x); // Store the results back into the C container
diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h
index 9de036ef4..4106f3851 100644
--- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h
+++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h
@@ -19,10 +19,10 @@ static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qB
float* qBufferPtr = qBuffer;
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
__m128 cplxValue1, cplxValue2, iValue, qValue;
for(;number < quarterPoints; number++){
-
+
cplxValue1 = _mm_load_ps(complexVectorPtr);
complexVectorPtr += 4;
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h
index 29c369d9a..77566e671 100644
--- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h
+++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h
@@ -20,23 +20,23 @@ static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double*
double* iBufferPtr = iBuffer;
double* qBufferPtr = qBuffer;
- const unsigned int halfPoints = num_points / 2;
+ const unsigned int halfPoints = num_points / 2;
__m128 cplxValue, fVal;
__m128d dVal;
for(;number < halfPoints; number++){
-
+
cplxValue = _mm_load_ps(complexVectorPtr);
complexVectorPtr += 4;
// Arrange in i1i2i1i2 format
fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
- dVal = _mm_cvtps_pd(fVal);
+ dVal = _mm_cvtps_pd(fVal);
_mm_store_pd(iBufferPtr, dVal);
// Arrange in q1q2q1q2 format
fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
- dVal = _mm_cvtps_pd(fVal);
+ dVal = _mm_cvtps_pd(fVal);
_mm_store_pd(qBufferPtr, dVal);
iBufferPtr += 2;
diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h
index adc4112b9..c88809beb 100644
--- a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h
+++ b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h
@@ -21,7 +21,7 @@ static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const l
__m128 cplxValue1, cplxValue2, iValue;
for(;number < quarterPoints; number++){
-
+
cplxValue1 = _mm_load_ps(complexVectorPtr);
complexVectorPtr += 4;
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h
index a1d0fd5d1..0d6c6b7af 100644
--- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h
+++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h
@@ -21,7 +21,7 @@ static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const l
__m128 cplxValue1, cplxValue2, iValue;
for(;number < quarterPoints; number++){
-
+
cplxValue1 = _mm_load_ps(complexVectorPtr);
complexVectorPtr += 4;
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h
index 70a3b1971..1e346baca 100644
--- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h
+++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h
@@ -18,17 +18,17 @@ static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, const
const float* complexVectorPtr = (float*)complexVector;
double* iBufferPtr = iBuffer;
- const unsigned int halfPoints = num_points / 2;
+ const unsigned int halfPoints = num_points / 2;
__m128 cplxValue, fVal;
__m128d dVal;
for(;number < halfPoints; number++){
-
+
cplxValue = _mm_load_ps(complexVectorPtr);
complexVectorPtr += 4;
// Arrange in i1i2i1i2 format
fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
- dVal = _mm_cvtps_pd(fVal);
+ dVal = _mm_cvtps_pd(fVal);
_mm_store_pd(iBufferPtr, dVal);
iBufferPtr += 2;
diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/include/volk/volk_32fc_index_max_16u_a.h
index 125a34582..842a6a042 100644
--- a/volk/include/volk/volk_32fc_index_max_16u_a.h
+++ b/volk/include/volk/volk_32fc_index_max_16u_a.h
@@ -12,16 +12,16 @@
static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
-
-
-
+
+
+
union bit128 holderf;
union bit128 holderi;
float sq_dist = 0.0;
-
+
union bit128 xmm5, xmm4;
__m128 xmm1, xmm2, xmm3;
__m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
@@ -30,63 +30,63 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_
xmm4.int_vec = xmmfour = _mm_setzero_si128();
holderf.int_vec = holder0 = _mm_setzero_si128();
holderi.int_vec = holder1 = _mm_setzero_si128();
-
-
+
+
int bound = num_bytes >> 5;
int leftovers0 = (num_bytes >> 4) & 1;
int leftovers1 = (num_bytes >> 3) & 1;
int i = 0;
-
-
+
+
xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
xmm9 = xmm8 = _mm_setzero_si128();
xmm10 = _mm_set_epi32(4, 4, 4, 4);
xmm3 = _mm_setzero_ps();
;
-
+
//printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
-
+
for(; i < bound; ++i) {
-
+
xmm1 = _mm_load_ps((float*)src0);
xmm2 = _mm_load_ps((float*)&src0[2]);
-
+
src0 += 4;
-
-
+
+
xmm1 = _mm_mul_ps(xmm1, xmm1);
xmm2 = _mm_mul_ps(xmm2, xmm2);
-
-
+
+
xmm1 = _mm_hadd_ps(xmm1, xmm2);
xmm3 = _mm_max_ps(xmm1, xmm3);
-
+
xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
-
-
-
+
+
+
xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
-
+
xmm9 = _mm_add_epi32(xmm11, xmm12);
- xmm8 = _mm_add_epi32(xmm8, xmm10);
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+
-
//printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
//printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
}
-
-
+
+
for(i = 0; i < leftovers0; ++i) {
xmm2 = _mm_load_ps((float*)src0);
-
+
xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
xmm8 = bit128_p(&xmm1)->int_vec;
@@ -99,63 +99,63 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_
xmm3 = _mm_max_ps(xmm1, xmm3);
xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
-
-
+
+
xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
-
-
-
+
+
+
xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
-
+
xmm9 = _mm_add_epi32(xmm11, xmm12);
- xmm8 = _mm_add_epi32(xmm8, xmm10);
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
//printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
}
-
-
-
+
+
+
for(i = 0; i < leftovers1; ++i) {
//printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-
+
sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
-
+
xmm2 = _mm_load1_ps(&sq_dist);
xmm1 = xmm3;
-
+
xmm3 = _mm_max_ss(xmm3, xmm2);
-
-
+
+
xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
-
-
- xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
-
+
+
+ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
-
+
xmm9 = _mm_add_epi32(xmm11, xmm12);
}
-
+
//printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
//printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
_mm_store_ps((float*)&(holderf.f), xmm3);
_mm_store_si128(&(holderi.int_vec), xmm9);
-
+
target[0] = holderi.i[0];
- sq_dist = holderf.f[0];
+ sq_dist = holderf.f[0];
target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
@@ -163,27 +163,27 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_
target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-
-
+
+
/*
float placeholder = 0.0;
- uint32_t temp0, temp1;
+ uint32_t temp0, temp1;
unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
unsigned int l0 = g0 ^ 1;
unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
unsigned int l1 = g1 ^ 1;
-
+
temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
+ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
-
+
g0 = (sq_dist > placeholder);
l0 = g0 ^ 1;
target[0] = g0 * temp0 + l0 * temp1;
*/
-
+
}
#endif /*LV_HAVE_SSE3*/
@@ -193,20 +193,20 @@ static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32
float sq_dist = 0.0;
float max = 0.0;
unsigned int index = 0;
-
- unsigned int i = 0;
-
+
+ unsigned int i = 0;
+
for(; i < num_bytes >> 3; ++i) {
sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
-
+
index = sq_dist > max ? i : index;
max = sq_dist > max ? sq_dist : max;
-
-
+
+
}
target[0] = index;
-
+
}
#endif /*LV_HAVE_GENERIC*/
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a.h b/volk/include/volk/volk_32fc_magnitude_32f_a.h
index f18e9bc0b..efb84a904 100644
--- a/volk/include/volk/volk_32fc_magnitude_32f_a.h
+++ b/volk/include/volk/volk_32fc_magnitude_32f_a.h
@@ -59,7 +59,7 @@ static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const
static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* complexVectorPtr = (float*)complexVector;
float* magnitudeVectorPtr = magnitudeVector;
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h
index ed1cedef9..c8b3f0a08 100644
--- a/volk/include/volk/volk_32fc_magnitude_32f_u.h
+++ b/volk/include/volk/volk_32fc_magnitude_32f_u.h
@@ -59,7 +59,7 @@ static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const
static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* complexVectorPtr = (float*)complexVector;
float* magnitudeVectorPtr = magnitudeVector;
diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h
index 00bdefbb5..d3ac9717a 100644
--- a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h
+++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h
@@ -57,7 +57,7 @@ static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector
static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* complexVectorPtr = (float*)complexVector;
float* magnitudeVectorPtr = magnitudeVector;
diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h
index 6eb4a523a..53a4e68eb 100644
--- a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h
+++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h
@@ -57,7 +57,7 @@ static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector
static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const float* complexVectorPtr = (float*)complexVector;
float* magnitudeVectorPtr = magnitudeVector;
diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h
index 7bd001aa0..d86bd63c1 100644
--- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h
+++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h
@@ -27,14 +27,14 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const
const float invNormalizeFactor = 1.0 / normalizeFactor;
#ifdef LV_HAVE_LIB_SIMDMATH
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
__m128 testVector = _mm_set_ps1(2*M_PI);
__m128 correctVector = _mm_set_ps1(M_PI);
__m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
__m128 phase;
__m128 complex1, complex2, iValue, qValue;
__m128 keepMask;
-
+
for (; number < quarterPoints; number++) {
// Load IQ data:
complex1 = _mm_load_ps(complexVectorPtr);
@@ -42,15 +42,15 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const
complex2 = _mm_load_ps(complexVectorPtr);
complexVectorPtr += 4;
// Deinterleave IQ data:
- iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
- qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
+ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
+ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
// Arctan to get phase:
phase = atan2f4(qValue, iValue);
// When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
// Compare to 2pi:
keepMask = _mm_cmpneq_ps(phase,testVector);
phase = _mm_blendv_ps(correctVector, phase, keepMask);
- // done with above correction.
+ // done with above correction.
phase = _mm_mul_ps(phase, vNormalizeFactor);
_mm_store_ps((float*)outPtr, phase);
outPtr += 4;
@@ -89,7 +89,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv
const float invNormalizeFactor = 1.0 / normalizeFactor;
#ifdef LV_HAVE_LIB_SIMDMATH
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
__m128 testVector = _mm_set_ps1(2*M_PI);
__m128 correctVector = _mm_set_ps1(M_PI);
__m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
@@ -97,7 +97,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv
__m128 complex1, complex2, iValue, qValue;
__m128 mask;
__m128 keepMask;
-
+
for (; number < quarterPoints; number++) {
// Load IQ data:
complex1 = _mm_load_ps(complexVectorPtr);
@@ -105,8 +105,8 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv
complex2 = _mm_load_ps(complexVectorPtr);
complexVectorPtr += 4;
// Deinterleave IQ data:
- iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
- qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
+ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
+ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
// Arctan to get phase:
phase = atan2f4(qValue, iValue);
// When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
@@ -115,7 +115,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv
phase = _mm_and_ps(phase, keepMask);
mask = _mm_andnot_ps(keepMask, correctVector);
phase = _mm_or_ps(phase, mask);
- // done with above correction.
+ // done with above correction.
phase = _mm_mul_ps(phase, vNormalizeFactor);
_mm_store_ps((float*)outPtr, phase);
outPtr += 4;
diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h
index 588b532b4..3106edbef 100644
--- a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h
+++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h
@@ -28,55 +28,55 @@ static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, con
*/
static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
unsigned int number = 0;
-
+
lv_32fc_t* cPtr = cVector;
const lv_32fc_t* aPtr = aVector;
#ifdef LV_HAVE_LIB_SIMDMATH
const unsigned int quarterPoints = num_points / 4;
__m128 vPower = _mm_set_ps1(power);
-
+
__m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
for(;number < quarterPoints; number++){
-
- cplxValue1 = _mm_load_ps((float*)aPtr);
+
+ cplxValue1 = _mm_load_ps((float*)aPtr);
aPtr += 2;
-
- cplxValue2 = _mm_load_ps((float*)aPtr);
+
+ cplxValue2 = _mm_load_ps((float*)aPtr);
aPtr += 2;
-
+
// Convert to polar coordinates
-
+
// Arrange in i1i2i3i4 format
iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
// Arrange in q1q2q3q4 format
qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
+
phase = atan2f4(qValue, iValue); // Calculate the Phase
-
+
magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values
-
+
// Now calculate the power of the polar coordinate data
magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
-
+
phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
-
+
// Convert back to cartesian coordinates
iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude
qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude
-
+
cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
-
+
_mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container
-
+
cPtr += 2;
-
+
_mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container
-
+
cPtr += 2;
}
-
+
number = quarterPoints * 4;
#endif /* LV_HAVE_LIB_SIMDMATH */
diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h
index 8d1959dae..30a77dbc1 100644
--- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h
+++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h
@@ -34,7 +34,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu
__m128 input1, input2;
const uint64_t quarterPoints = num_points / 4;
for(;number < quarterPoints; number++){
- // Load the complex values
+ // Load the complex values
input1 =_mm_load_ps(inputPtr);
inputPtr += 4;
input2 =_mm_load_ps(inputPtr);
@@ -43,30 +43,30 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu
// Apply the normalization factor
input1 = _mm_mul_ps(input1, invNormalizationFactor);
input2 = _mm_mul_ps(input2, invNormalizationFactor);
-
+
// Multiply each value by itself
// (r1*r1), (i1*i1), (r2*r2), (i2*i2)
input1 = _mm_mul_ps(input1, input1);
// (r3*r3), (i3*i3), (r4*r4), (i4*i4)
input2 = _mm_mul_ps(input2, input2);
-
+
// Horizontal add, to add (r*r) + (i*i) for each complex value
// (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
power = _mm_hadd_ps(input1, input2);
-
+
// Calculate the natural log power
power = logf4(power);
-
+
// Convert to log10 and multiply by 10.0
power = _mm_mul_ps(power, magScalar);
-
+
// Store the floating point results
_mm_store_ps(destPtr, power);
-
+
destPtr += 4;
}
-
- number = quarterPoints*4;
+
+ number = quarterPoints*4;
#endif /* LV_HAVE_LIB_SIMDMATH */
// Calculate the FFT for any remaining points
@@ -81,10 +81,10 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu
const float imag = *inputPtr++ * iNormalizationFactor;
*destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
-
+
destPtr++;
}
-
+
}
#endif /* LV_HAVE_SSE3 */
@@ -114,7 +114,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_generic(float* logPowerOu
*realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
-
+
realFFTDataPointsPtr++;
}
}
diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h
index fc635f171..27f755351 100644
--- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h
+++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h
@@ -32,19 +32,19 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo
magScalar = _mm_div_ps(magScalar, logf4(magScalar));
__m128 invRBW = _mm_set_ps1(iRBW);
-
+
__m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
__m128 power;
__m128 input1, input2;
const uint64_t quarterPoints = num_points / 4;
for(;number < quarterPoints; number++){
- // Load the complex values
+ // Load the complex values
input1 =_mm_load_ps(inputPtr);
inputPtr += 4;
input2 =_mm_load_ps(inputPtr);
inputPtr += 4;
-
+
// Apply the normalization factor
input1 = _mm_mul_ps(input1, invNormalizationFactor);
input2 = _mm_mul_ps(input2, invNormalizationFactor);
@@ -54,7 +54,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo
input1 = _mm_mul_ps(input1, input1);
// (r3*r3), (i3*i3), (r4*r4), (i4*i4)
input2 = _mm_mul_ps(input2, input2);
-
+
// Horizontal add, to add (r*r) + (i*i) for each complex value
// (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
power = _mm_hadd_ps(input1, input2);
@@ -64,17 +64,17 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo
// Calculate the natural log power
power = logf4(power);
-
+
// Convert to log10 and multiply by 10.0
power = _mm_mul_ps(power, magScalar);
-
+
// Store the floating point results
_mm_store_ps(destPtr, power);
-
+
destPtr += 4;
}
-
- number = quarterPoints*4;
+
+ number = quarterPoints*4;
#endif /* LV_HAVE_LIB_SIMDMATH */
// Calculate the FFT for any remaining points
for(; number < num_points; number++){
@@ -83,14 +83,14 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo
// 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
// 75 ohm load assumption
// 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
+
const float real = *inputPtr++ * iNormalizationFactor;
const float imag = *inputPtr++ * iNormalizationFactor;
*destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
destPtr++;
}
-
+
}
#endif /* LV_HAVE_SSE3 */
@@ -122,7 +122,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_generic(float*
const float imag = *inputPtr++ * iNormalizationFactor;
*realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
-
+
realFFTDataPointsPtr++;
}
}
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
index 534dc2a25..f206c5e87 100644
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
@@ -28,17 +28,17 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons
yh = _mm_set_ps1(lv_cimag(scalar));
for(;number < halfPoints; number++){
-
+
x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-
+
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
+
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
+
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
+
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
+
_mm_store_ps((float*)c,z); // Store the results back into the C container
a += 2;
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
index 218c450f8..5c7d15b02 100644
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
@@ -28,17 +28,17 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons
yh = _mm_set_ps1(lv_cimag(scalar));
for(;number < halfPoints; number++){
-
+
x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-
+
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
+
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
+
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
+
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
+
_mm_storeu_ps((float*)c,z); // Store the results back into the C container
a += 2;
diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h
index 655075528..e3dedf2fc 100644
--- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h
+++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h
@@ -10,40 +10,40 @@
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-
+
float * res = (float*) result;
float * in = (float*) input;
float * tp = (float*) taps;
unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
unsigned int isodd = (num_bytes >> 3) &1;
-
-
-
+
+
+
float sum0[2] = {0,0};
float sum1[2] = {0,0};
unsigned int i = 0;
-
+
for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-
+
sum0[0] += in[0] * tp[0] + in[1] * tp[1];
sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
sum1[0] += in[2] * tp[2] + in[3] * tp[3];
sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
-
-
+
+
in += 4;
tp += 4;
}
-
-
+
+
res[0] = sum0[0] + sum1[0];
res[1] = sum0[1] + sum1[1];
-
-
-
+
+
+
for(i = 0; i < isodd; ++i) {
@@ -64,13 +64,13 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* res
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-
+
__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-
- asm volatile
+
+ asm volatile
(
"# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
"# const float *taps, unsigned num_bytes)\n\t"
@@ -187,32 +187,32 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result,
:[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
:"rax", "r8", "r9", "r10"
);
-
-
+
+
int getem = num_bytes % 16;
-
-
+
+
for(; getem > 0; getem -= 8) {
-
-
+
+
*result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
-
+
}
return;
-}
+}
#endif
#if LV_HAVE_SSE && LV_HAVE_32
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-
+
__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
int bound = num_bytes >> 4;
int leftovers = num_bytes % 16;
-
- asm volatile
+
+ asm volatile
(
" #pushl %%ebp\n\t"
" #movl %%esp, %%ebp\n\t"
@@ -226,7 +226,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* resu
" movaps 0(%[edx]), %%xmm2\n\t"
" movl %[ecx], (%[out])\n\t"
" shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
-
+
" xorps %%xmm1, %%xmm2\n\t"
" jmp .%=L1_test\n\t"
" # 4 taps / loop\n\t"
@@ -317,28 +317,28 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* resu
: [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
);
-
-
-
+
+
+
printf("%d, %d\n", leftovers, bound);
-
+
for(; leftovers > 0; leftovers -= 8) {
-
-
+
+
*result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
-
+
}
-
+
return;
-
-
-
-
-
+
+
+
+
+
}
-#endif /*LV_HAVE_SSE*/
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h
index 3ae7208a8..e7493413f 100644
--- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h
+++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h
@@ -9,39 +9,39 @@
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-
+
float * res = (float*) result;
float * in = (float*) input;
float * tp = (float*) taps;
unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
unsigned int isodd = (num_bytes >> 3) &1;
-
-
-
+
+
+
float sum0[2] = {0,0};
float sum1[2] = {0,0};
unsigned int i = 0;
-
+
for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-
+
sum0[0] += in[0] * tp[0] + in[1] * tp[1];
sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
sum1[0] += in[2] * tp[2] + in[3] * tp[3];
sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
-
-
+
+
in += 4;
tp += 4;
}
-
-
+
+
res[0] = sum0[0] + sum1[0];
res[1] = sum0[1] + sum1[1];
-
-
-
+
+
+
for(i = 0; i < isodd; ++i) {
@@ -73,7 +73,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result
uint32_t intRep[4];
__m128 vec;
} halfMask;
-
+
union NegMask {
int intRep[4];
__m128 vec;
@@ -85,13 +85,13 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result
__m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
__m128 zv = {0,0,0,0};
-
+
halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
negMask.intRep[1] = negMask.intRep[3] = 0;
-
+
// main loop
while(num_bytes >= 4*sizeof(float)){
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
index cde9240cc..cb2ac4c67 100644
--- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
+++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
@@ -7,44 +7,44 @@
#include <string.h>
-#ifdef LV_HAVE_GENERIC
+#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-
+
float * res = (float*) result;
float * in = (float*) input;
float * tp = (float*) taps;
unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
unsigned int isodd = (num_bytes >> 3) &1;
-
-
-
+
+
+
float sum0[2] = {0,0};
float sum1[2] = {0,0};
unsigned int i = 0;
-
+
for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-
+
sum0[0] += in[0] * tp[0] - in[1] * tp[1];
sum0[1] += in[0] * tp[1] + in[1] * tp[0];
sum1[0] += in[2] * tp[2] - in[3] * tp[3];
sum1[1] += in[2] * tp[3] + in[3] * tp[2];
-
-
+
+
in += 4;
tp += 4;
}
-
+
res[0] = sum0[0] + sum1[0];
res[1] = sum0[1] + sum1[1];
-
-
-
+
+
+
for(i = 0; i < isodd; ++i) {
@@ -61,9 +61,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const
static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-
- asm
+
+ asm
(
"# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
"# const float *taps, unsigned num_bytes)\n\t"
@@ -175,20 +175,20 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const
:[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
:"rax", "r8", "r9", "r10"
);
-
-
+
+
int getem = num_bytes % 16;
-
-
+
+
for(; getem > 0; getem -= 8) {
-
-
+
+
*result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
-
+
}
return;
-
+
}
#endif
@@ -200,7 +200,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const
volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_bytes);
#if 0
- asm volatile
+ asm volatile
(
" #pushl %%ebp\n\t"
" #movl %%esp, %%ebp\n\t"
@@ -299,28 +299,28 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const
: "eax", "ecx", "edx"
);
-
+
int getem = num_bytes % 16;
-
+
for(; getem > 0; getem -= 8) {
-
-
+
+
*result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
-
+
}
-
+
return;
-#endif
+#endif
}
-#endif /*LV_HAVE_SSE*/
+#endif /*LV_HAVE_SSE*/
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-
+
lv_32fc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(float));
@@ -336,19 +336,19 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
dotProdVal = _mm_setzero_ps();
for(;number < halfPoints; number++){
-
+
x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-
+
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
+
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
+
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
+
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
+
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
@@ -368,7 +368,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
}
*result = dotProduct;
-}
+}
#endif /*LV_HAVE_SSE3*/
@@ -379,7 +379,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
volk_32fc_x2_dot_prod_32fc_a_sse3(result, input, taps, num_bytes);
// SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now
- /*
+ /*
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
float *p_input, *p_taps;
__m64 *p_result;
@@ -391,7 +391,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
static const __m128i neg = {0x000000000000000080000000};
int i = 0;
-
+
int bound = (num_bytes >> 5);
int leftovers = (num_bytes & 24) >> 3;
@@ -399,27 +399,27 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
real1 = _mm_sub_ps(real1, real1);
im0 = _mm_sub_ps(im0, im0);
im1 = _mm_sub_ps(im1, im1);
-
+
for(; i < bound; ++i) {
-
-
+
+
xmm0 = _mm_load_ps(p_input);
xmm1 = _mm_load_ps(p_taps);
-
+
p_input += 4;
p_taps += 4;
-
+
xmm2 = _mm_load_ps(p_input);
xmm3 = _mm_load_ps(p_taps);
-
+
p_input += 4;
p_taps += 4;
-
+
xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
-
+
//imaginary vector from input
xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
//real vector from input
@@ -428,39 +428,39 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
//real vector from taps
xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
-
+
xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
-
+
xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
-
+
real0 = _mm_add_ps(xmm4, real0);
real1 = _mm_add_ps(xmm5, real1);
im0 = _mm_add_ps(xmm6, im0);
im1 = _mm_add_ps(xmm7, im1);
-
+
}
-
-
+
+
real1 = _mm_xor_ps(real1, (__m128)neg);
-
-
+
+
im0 = _mm_add_ps(im0, im1);
real0 = _mm_add_ps(real0, real1);
-
+
im0 = _mm_add_ps(im0, real0);
-
+
_mm_storel_pi(p_result, im0);
-
+
for(i = bound * 4; i < (bound * 4) + leftovers; ++i) {
-
+
*result += input[i] * taps[i];
}
*/
-}
+}
#endif /*LV_HAVE_SSE4_1*/
diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h
index aec8bd716..f79ddb59b 100644
--- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h
+++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h
@@ -24,21 +24,21 @@ static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const l
const lv_32fc_t* a = aVector;
const lv_32fc_t* b = bVector;
for(;number < halfPoints; number++){
-
+
x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-
+
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
+
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
+
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
+
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
+
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
+
_mm_store_ps((float*)c,z); // Store the results back into the C container
a += 2;
diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h
index 729c1a4ad..a998d6184 100644
--- a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h
+++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h
@@ -25,21 +25,21 @@ static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const l
const lv_32fc_t* b = bVector;
for(;number < halfPoints; number++){
-
+
x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-
+
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
+
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
+
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
+
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
+
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
+
_mm_storeu_ps((float*)c,z); // Store the results back into the C container
a += 2;
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
index 2a1bcbce0..2755192e9 100644
--- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
+++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
@@ -27,23 +27,23 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVecto
__m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
for(;number < halfPoints; number++){
-
+
x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
y = _mm_xor_ps(y, conjugator); // conjugate y
-
+
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
+
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
+
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
+
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
+
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
+
_mm_store_ps((float*)c,z); // Store the results back into the C container
a += 2;
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
index 92f6a051e..09dcd635b 100644
--- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
+++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
@@ -27,23 +27,23 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVecto
__m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
for(;number < halfPoints; number++){
-
+
x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
y = _mm_xor_ps(y, conjugator); // conjugate y
-
+
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
+
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
+
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
+
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
+
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
+
_mm_storeu_ps((float*)c,z); // Store the results back into the C container
a += 2;
diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h
index 2d5f36b27..75eb9173d 100644
--- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h
+++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h
@@ -11,7 +11,7 @@
#include<pmmintrin.h>
static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
-
+
__m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
@@ -23,31 +23,31 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t
int leftovers0 = (num_bytes >> 4) & 1;
int leftovers1 = (num_bytes >> 3) & 1;
int i = 0;
-
-
-
+
+
+
xmm1 = _mm_setzero_ps();
- xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+ xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
xmm2 = _mm_load_ps((float*)&points[0]);
xmm8 = _mm_load1_ps(&scalar);
xmm1 = _mm_movelh_ps(xmm1, xmm1);
xmm3 = _mm_load_ps((float*)&points[2]);
-
-
+
+
for(; i < bound - 1; ++i) {
-
+
xmm4 = _mm_sub_ps(xmm1, xmm2);
xmm5 = _mm_sub_ps(xmm1, xmm3);
points += 4;
xmm6 = _mm_mul_ps(xmm4, xmm4);
xmm7 = _mm_mul_ps(xmm5, xmm5);
-
+
xmm2 = _mm_load_ps((float*)&points[0]);
-
+
xmm4 = _mm_hadd_ps(xmm6, xmm7);
xmm3 = _mm_load_ps((float*)&points[2]);
-
+
xmm4 = _mm_mul_ps(xmm4, xmm8);
_mm_store_ps(target, xmm4);
@@ -55,46 +55,46 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t
target += 4;
}
-
+
xmm4 = _mm_sub_ps(xmm1, xmm2);
xmm5 = _mm_sub_ps(xmm1, xmm3);
-
-
+
+
points += 4;
xmm6 = _mm_mul_ps(xmm4, xmm4);
xmm7 = _mm_mul_ps(xmm5, xmm5);
-
+
xmm4 = _mm_hadd_ps(xmm6, xmm7);
-
+
xmm4 = _mm_mul_ps(xmm4, xmm8);
-
+
_mm_store_ps(target, xmm4);
-
+
target += 4;
-
+
for(i = 0; i < leftovers0; ++i) {
-
+
xmm2 = _mm_load_ps((float*)&points[0]);
-
+
xmm4 = _mm_sub_ps(xmm1, xmm2);
-
+
points += 2;
-
+
xmm6 = _mm_mul_ps(xmm4, xmm4);
xmm4 = _mm_hadd_ps(xmm6, xmm6);
xmm4 = _mm_mul_ps(xmm4, xmm8);
-
+
_mm_storeh_pi((__m64*)target, xmm4);
target += 2;
}
for(i = 0; i < leftovers1; ++i) {
-
+
diff = src0[0] - points[0];
sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
@@ -109,13 +109,13 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t
static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
lv_32fc_t diff;
float sq_dist;
- unsigned int i = 0;
-
+ unsigned int i = 0;
+
for(; i < num_bytes >> 3; ++i) {
diff = src0[0] - points[i];
sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
-
+
target[i] = sq_dist;
}
}
diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h
index 6a4a08ca5..b819eaffd 100644
--- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h
+++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h
@@ -10,7 +10,7 @@
#include<pmmintrin.h>
static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
-
+
__m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -22,11 +22,11 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t*
int i = 0;
xmm1 = _mm_setzero_ps();
- xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+ xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
xmm2 = _mm_load_ps((float*)&points[0]);
xmm1 = _mm_movelh_ps(xmm1, xmm1);
xmm3 = _mm_load_ps((float*)&points[2]);
-
+
for(; i < bound - 1; ++i) {
xmm4 = _mm_sub_ps(xmm1, xmm2);
@@ -34,9 +34,9 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t*
points += 4;
xmm6 = _mm_mul_ps(xmm4, xmm4);
xmm7 = _mm_mul_ps(xmm5, xmm5);
-
+
xmm2 = _mm_load_ps((float*)&points[0]);
-
+
xmm4 = _mm_hadd_ps(xmm6, xmm7);
xmm3 = _mm_load_ps((float*)&points[2]);
@@ -46,41 +46,41 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t*
target += 4;
}
-
+
xmm4 = _mm_sub_ps(xmm1, xmm2);
xmm5 = _mm_sub_ps(xmm1, xmm3);
-
-
+
+
points += 4;
xmm6 = _mm_mul_ps(xmm4, xmm4);
xmm7 = _mm_mul_ps(xmm5, xmm5);
-
+
xmm4 = _mm_hadd_ps(xmm6, xmm7);
-
+
_mm_store_ps(target, xmm4);
-
+
target += 4;
for(i = 0; i < leftovers0; ++i) {
-
+
xmm2 = _mm_load_ps((float*)&points[0]);
-
+
xmm4 = _mm_sub_ps(xmm1, xmm2);
-
+
points += 2;
-
+
xmm6 = _mm_mul_ps(xmm4, xmm4);
xmm4 = _mm_hadd_ps(xmm6, xmm6);
-
+
_mm_storeh_pi((__m64*)target, xmm4);
target += 2;
}
for(i = 0; i < leftovers1; ++i) {
-
+
diff = src0[0] - points[0];
sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
@@ -95,13 +95,13 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t*
static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
lv_32fc_t diff;
float sq_dist;
- unsigned int i = 0;
-
+ unsigned int i = 0;
+
for(; i < num_bytes >> 3; ++i) {
diff = src0[0] - points[i];
sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
-
+
target[i] = sq_dist;
}
}
diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a.h b/volk/include/volk/volk_32i_s32f_convert_32f_a.h
index 558142869..8f4123d71 100644
--- a/volk/include/volk/volk_32i_s32f_convert_32f_a.h
+++ b/volk/include/volk/volk_32i_s32f_convert_32f_a.h
@@ -17,7 +17,7 @@
static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
float* outputVectorPtr = outputVector;
const float iScalar = 1.0 / scalar;
__m128 invScalar = _mm_set_ps1(iScalar);
diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h
index d8afd218c..b3a8ab201 100644
--- a/volk/include/volk/volk_32i_s32f_convert_32f_u.h
+++ b/volk/include/volk/volk_32i_s32f_convert_32f_u.h
@@ -18,7 +18,7 @@
static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
float* outputVectorPtr = outputVector;
const float iScalar = 1.0 / scalar;
__m128 invScalar = _mm_set_ps1(iScalar);
diff --git a/volk/include/volk/volk_32i_x2_and_32i_a.h b/volk/include/volk/volk_32i_x2_and_32i_a.h
index dcd63d98e..e5330847b 100644
--- a/volk/include/volk/volk_32i_x2_and_32i_a.h
+++ b/volk/include/volk/volk_32i_x2_and_32i_a.h
@@ -23,12 +23,12 @@ static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aV
__m128 aVal, bVal, cVal;
for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
+
+ aVal = _mm_load_ps(aPtr);
bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_and_ps(aVal, bVal);
-
+
+ cVal = _mm_and_ps(aVal, bVal);
+
_mm_store_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
diff --git a/volk/include/volk/volk_32i_x2_or_32i_a.h b/volk/include/volk/volk_32i_x2_or_32i_a.h
index 243e8178c..24045894c 100644
--- a/volk/include/volk/volk_32i_x2_or_32i_a.h
+++ b/volk/include/volk/volk_32i_x2_or_32i_a.h
@@ -23,12 +23,12 @@ static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVe
__m128 aVal, bVal, cVal;
for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
+
+ aVal = _mm_load_ps(aPtr);
bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_or_ps(aVal, bVal);
-
+
+ cVal = _mm_or_ps(aVal, bVal);
+
_mm_store_ps(cPtr,cVal); // Store the results back into the C container
aPtr += 4;
diff --git a/volk/include/volk/volk_32u_byteswap_a.h b/volk/include/volk/volk_32u_byteswap_a.h
index b88848096..71ae027d3 100644
--- a/volk/include/volk/volk_32u_byteswap_a.h
+++ b/volk/include/volk/volk_32u_byteswap_a.h
@@ -39,9 +39,9 @@ static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int n
_mm_store_si128((__m128i*)inputPtr, output);
inputPtr += 4;
}
-
+
// Byteswap any remaining points:
- number = quarterPoints*4;
+ number = quarterPoints*4;
for(; number < num_points; number++){
uint32_t outputVal = *inputPtr;
outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
@@ -64,7 +64,7 @@ static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned in
for(point = 0; point < num_points; point++){
uint32_t output = *inputPtr;
output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
-
+
*inputPtr = output;
inputPtr++;
}
diff --git a/volk/include/volk/volk_64f_convert_32f_a.h b/volk/include/volk/volk_64f_convert_32f_a.h
index 2126e4f95..11d51702b 100644
--- a/volk/include/volk/volk_64f_convert_32f_a.h
+++ b/volk/include/volk/volk_64f_convert_32f_a.h
@@ -16,7 +16,7 @@ static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const double* inputVectorPtr = (const double*)inputVector;
float* outputVectorPtr = outputVector;
__m128 ret, ret2;
@@ -25,7 +25,7 @@ static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double
for(;number < quarterPoints; number++){
inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
-
+
ret = _mm_cvtpd_ps(inputVal1);
ret2 = _mm_cvtpd_ps(inputVal2);
@@ -35,7 +35,7 @@ static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double
outputVectorPtr += 4;
}
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
for(; number < num_points; number++){
outputVector[number] = (float)(inputVector[number]);
}
diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h
index 5c323230a..31dc5b5fe 100644
--- a/volk/include/volk/volk_64f_convert_32f_u.h
+++ b/volk/include/volk/volk_64f_convert_32f_u.h
@@ -16,7 +16,7 @@ static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
-
+
const double* inputVectorPtr = (const double*)inputVector;
float* outputVectorPtr = outputVector;
__m128 ret, ret2;
@@ -25,7 +25,7 @@ static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double
for(;number < quarterPoints; number++){
inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
-
+
ret = _mm_cvtpd_ps(inputVal1);
ret2 = _mm_cvtpd_ps(inputVal2);
@@ -35,7 +35,7 @@ static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double
outputVectorPtr += 4;
}
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
for(; number < num_points; number++){
outputVector[number] = (float)(inputVector[number]);
}
diff --git a/volk/include/volk/volk_64f_x2_max_64f_a.h b/volk/include/volk/volk_64f_x2_max_64f_a.h
index 61a704c52..33aae6d10 100644
--- a/volk/include/volk/volk_64f_x2_max_64f_a.h
+++ b/volk/include/volk/volk_64f_x2_max_64f_a.h
@@ -23,12 +23,12 @@ static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVe
__m128d aVal, bVal, cVal;
for(;number < halfPoints; number++){
-
- aVal = _mm_load_pd(aPtr);
+
+ aVal = _mm_load_pd(aPtr);
bVal = _mm_load_pd(bPtr);
-
- cVal = _mm_max_pd(aVal, bVal);
-
+
+ cVal = _mm_max_pd(aVal, bVal);
+
_mm_store_pd(cPtr,cVal); // Store the results back into the C container
aPtr += 2;
diff --git a/volk/include/volk/volk_64f_x2_min_64f_a.h b/volk/include/volk/volk_64f_x2_min_64f_a.h
index 148b72c59..25d8b4c98 100644
--- a/volk/include/volk/volk_64f_x2_min_64f_a.h
+++ b/volk/include/volk/volk_64f_x2_min_64f_a.h
@@ -23,12 +23,12 @@ static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVe
__m128d aVal, bVal, cVal;
for(;number < halfPoints; number++){
-
- aVal = _mm_load_pd(aPtr);
+
+ aVal = _mm_load_pd(aPtr);
bVal = _mm_load_pd(bPtr);
-
- cVal = _mm_min_pd(aVal, bVal);
-
+
+ cVal = _mm_min_pd(aVal, bVal);
+
_mm_store_pd(cPtr,cVal); // Store the results back into the C container
aPtr += 2;
diff --git a/volk/include/volk/volk_64u_byteswap_a.h b/volk/include/volk/volk_64u_byteswap_a.h
index d4fc74a6e..3d1d87623 100644
--- a/volk/include/volk/volk_64u_byteswap_a.h
+++ b/volk/include/volk/volk_64u_byteswap_a.h
@@ -34,7 +34,7 @@ static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int n
output = _mm_or_si128(output, byte2);
byte3 = _mm_and_si128(byte3, byte3mask);
output = _mm_or_si128(output, byte3);
-
+
// Reorder the two words
output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
@@ -42,17 +42,17 @@ static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int n
_mm_store_si128((__m128i*)inputPtr, output);
inputPtr += 4;
}
-
+
// Byteswap any remaining points:
- number = halfPoints*2;
+ number = halfPoints*2;
for(; number < num_points; number++){
uint32_t output1 = *inputPtr;
uint32_t output2 = inputPtr[1];
-
+
output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
-
+
output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
-
+
*inputPtr++ = output2;
*inputPtr++ = output1;
}
@@ -71,11 +71,11 @@ static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned in
for(point = 0; point < num_points; point++){
uint32_t output1 = *inputPtr;
uint32_t output2 = inputPtr[1];
-
+
output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
-
+
output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
-
+
*inputPtr++ = output2;
*inputPtr++ = output1;
}
diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h
index 4683f1e38..7d7359ccf 100644
--- a/volk/include/volk/volk_64u_popcnt_a.h
+++ b/volk/include/volk/volk_64u_popcnt_a.h
@@ -11,7 +11,7 @@
static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) {
//const uint32_t* valueVector = (const uint32_t*)&value;
-
+
// This is faster than a lookup table
//uint32_t retVal = valueVector[0];
uint32_t retVal = (uint32_t)(value && 0x00000000FFFFFFFF);
diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a.h b/volk/include/volk/volk_8i_s32f_convert_32f_a.h
index 7f2623ac6..02a7f356e 100644
--- a/volk/include/volk/volk_8i_s32f_convert_32f_a.h
+++ b/volk/include/volk/volk_8i_s32f_convert_32f_a.h
@@ -17,7 +17,7 @@
static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int sixteenthPoints = num_points / 16;
-
+
float* outputVectorPtr = outputVector;
const float iScalar = 1.0 / scalar;
__m128 invScalar = _mm_set_ps1(iScalar);
diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h
index 3cd6bb67c..8bb2c0d1a 100644
--- a/volk/include/volk/volk_8i_s32f_convert_32f_u.h
+++ b/volk/include/volk/volk_8i_s32f_convert_32f_u.h
@@ -18,7 +18,7 @@
static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int sixteenthPoints = num_points / 16;
-
+
float* outputVectorPtr = outputVector;
const float iScalar = 1.0 / scalar;
__m128 invScalar = _mm_set_ps1( iScalar );
diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h
index b723c6f8b..d82da59fb 100644
--- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h
+++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h
@@ -20,7 +20,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, fl
float* qBufferPtr = qBuffer;
unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ const unsigned int eighthPoints = num_points / 8;
__m128 iFloatValue, qFloatValue;
const float iScalar= 1.0 / scalar;
@@ -71,7 +71,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, fl
*iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
*qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
}
-
+
}
#endif /* LV_HAVE_SSE4_1 */
@@ -90,7 +90,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float
float* qBufferPtr = qBuffer;
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
__m128 cplxValue1, cplxValue2, iValue, qValue;
__m128 invScalar = _mm_set_ps1(1.0/scalar);
@@ -103,7 +103,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float
floatBuffer[1] = (float)(complexVectorPtr[1]);
floatBuffer[2] = (float)(complexVectorPtr[2]);
floatBuffer[3] = (float)(complexVectorPtr[3]);
-
+
floatBuffer[4] = (float)(complexVectorPtr[4]);
floatBuffer[5] = (float)(complexVectorPtr[5]);
floatBuffer[6] = (float)(complexVectorPtr[6]);
diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h
index 74073f5a6..b2c15d3a3 100644
--- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h
+++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h
@@ -18,7 +18,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
float* iBufferPtr = iBuffer;
unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ const unsigned int eighthPoints = num_points / 8;
__m128 iFloatValue;
const float iScalar= 1.0 / scalar;
@@ -57,7 +57,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
*iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
complexVectorPtr++;
}
-
+
}
#endif /* LV_HAVE_SSE4_1 */
@@ -75,7 +75,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con
float* iBufferPtr = iBuffer;
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
__m128 iValue;
const float iScalar= 1.0 / scalar;
@@ -88,7 +88,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con
floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
- floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
iValue = _mm_load_ps(floatBuffer);
@@ -104,7 +104,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con
*iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
complexVectorPtr++;
}
-
+
}
#endif /* LV_HAVE_SSE */
diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h
index 0c280eb6e..f85fdb999 100644
--- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h
+++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h
@@ -23,15 +23,15 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect
const lv_8sc_t* a = aVector;
const lv_8sc_t* b = bVector;
__m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
-
+
for(;number < quarterPoints; number++){
// Convert into 8 bit values into 16 bit values
x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
-
+
// Calculate the ar*cr - ai*(-ci) portions
realz = _mm_madd_epi16(x,y);
-
+
// Calculate the complex conjugate of the cr + ci j values
y = _mm_sign_epi16(y, conjugateSign);
@@ -47,7 +47,7 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect
b += 4;
c += 4;
}
-
+
number = quarterPoints * 4;
int16_t* c16Ptr = (int16_t*)&cVector[number];
int8_t* a8Ptr = (int8_t*)&aVector[number];
diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h
index a2c2b04f6..4b16171ce 100644
--- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h
+++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h
@@ -80,7 +80,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t*
float bImag = (float)*b8Ptr++;
lv_32fc_t bVal = lv_cmake( bReal, -bImag );
lv_32fc_t temp = aVal * bVal;
-
+
*cFloatPtr++ = lv_creal(temp) / scalar;
*cFloatPtr++ = lv_cimag(temp) / scalar;
}
@@ -109,7 +109,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_generic(lv_32fc_t*
float bImag = (float)*b8Ptr++;
lv_32fc_t bVal = lv_cmake( bReal, -bImag );
lv_32fc_t temp = aVal * bVal;
-
+
*cPtr++ = (lv_creal(temp) * invScalar);
*cPtr++ = (lv_cimag(temp) * invScalar);
}