summaryrefslogtreecommitdiff
path: root/volk
diff options
context:
space:
mode:
Diffstat (limited to 'volk')
-rw-r--r--volk/CMakeLists.txt138
-rw-r--r--volk/apps/CMakeLists.txt49
-rw-r--r--volk/apps/volk_profile.cc138
-rw-r--r--volk/cmake/FindORC.cmake36
-rw-r--r--volk/cmake/GrBoost.cmake97
-rw-r--r--volk/cmake/GrPython.cmake233
-rw-r--r--volk/cmake/msvc/config.h58
-rw-r--r--volk/cmake/msvc/inttypes.h301
-rw-r--r--volk/cmake/msvc/stdbool.h45
-rw-r--r--volk/cmake/msvc/stdint.h251
-rw-r--r--volk/gen/archs.xml184
-rw-r--r--volk/gen/machines.xml55
-rw-r--r--volk/gen/volk_arch_defs.py85
-rw-r--r--volk/gen/volk_compile_utils.py58
-rw-r--r--volk/gen/volk_kernel_defs.py209
-rw-r--r--volk/gen/volk_machine_defs.py74
-rw-r--r--volk/gen/volk_tmpl_utils.py74
-rw-r--r--volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h122
-rw-r--r--volk/include/volk/volk_16i_branch_4_state_8_a.h194
-rw-r--r--volk/include/volk/volk_16i_convert_8i_a.h69
-rw-r--r--volk/include/volk/volk_16i_convert_8i_u.h71
-rw-r--r--volk/include/volk/volk_16i_max_star_16i_a.h108
-rw-r--r--volk/include/volk/volk_16i_max_star_horizontal_16i_a.h130
-rw-r--r--volk/include/volk/volk_16i_permute_and_scalar_add_a.h139
-rw-r--r--volk/include/volk/volk_16i_s32f_convert_32f_a.h119
-rw-r--r--volk/include/volk/volk_16i_s32f_convert_32f_u.h122
-rw-r--r--volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h191
-rw-r--r--volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h136
-rw-r--r--volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h158
-rw-r--r--volk/include/volk/volk_16ic_deinterleave_real_16i_a.h120
-rw-r--r--volk/include/volk/volk_16ic_deinterleave_real_8i_a.h94
-rw-r--r--volk/include/volk/volk_16ic_magnitude_16i_a.h191
-rw-r--r--volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h109
-rw-r--r--volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h126
-rw-r--r--volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h180
-rw-r--r--volk/include/volk/volk_16u_byteswap_a.h77
-rw-r--r--volk/include/volk/volk_16u_byteswap_u.h63
-rw-r--r--volk/include/volk/volk_32f_accumulator_s32f_a.h68
-rw-r--r--volk/include/volk/volk_32f_convert_64f_a.h70
-rw-r--r--volk/include/volk/volk_32f_convert_64f_u.h70
-rw-r--r--volk/include/volk/volk_32f_index_max_16u_a.h149
-rw-r--r--volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h120
-rw-r--r--volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h168
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_16i_a.h150
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_16i_u.h152
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_32i_a.h189
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_32i_u.h142
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_8i_a.h155
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_8i_u.h157
-rw-r--r--volk/include/volk/volk_32f_s32f_multiply_32f_a.h119
-rw-r--r--volk/include/volk/volk_32f_s32f_multiply_32f_u.h102
-rw-r--r--volk/include/volk/volk_32f_s32f_normalize_a.h81
-rw-r--r--volk/include/volk/volk_32f_s32f_power_32f_a.h144
-rw-r--r--volk/include/volk/volk_32f_s32f_stddev_32f_a.h145
-rw-r--r--volk/include/volk/volk_32f_sqrt_32f_a.h77
-rw-r--r--volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h170
-rw-r--r--volk/include/volk/volk_32f_x2_add_32f_a.h81
-rw-r--r--volk/include/volk/volk_32f_x2_add_32f_u.h66
-rw-r--r--volk/include/volk/volk_32f_x2_divide_32f_a.h82
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_16i_a.h98
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_32f_a.h290
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_32f_u.h290
-rw-r--r--volk/include/volk/volk_32f_x2_interleave_32fc_a.h75
-rw-r--r--volk/include/volk/volk_32f_x2_max_32f_a.h85
-rw-r--r--volk/include/volk/volk_32f_x2_min_32f_a.h85
-rw-r--r--volk/include/volk/volk_32f_x2_multiply_32f_a.h120
-rw-r--r--volk/include/volk/volk_32f_x2_multiply_32f_u.h106
-rw-r--r--volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h156
-rw-r--r--volk/include/volk/volk_32f_x2_subtract_32f_a.h81
-rw-r--r--volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h151
-rw-r--r--volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h111
-rw-r--r--volk/include/volk/volk_32fc_32f_multiply_32fc_a.h95
-rw-r--r--volk/include/volk/volk_32fc_conjugate_32fc_a.h64
-rw-r--r--volk/include/volk/volk_32fc_conjugate_32fc_u.h64
-rw-r--r--volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h75
-rw-r--r--volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h78
-rw-r--r--volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h68
-rw-r--r--volk/include/volk/volk_32fc_deinterleave_real_32f_a.h68
-rw-r--r--volk/include/volk/volk_32fc_deinterleave_real_64f_a.h66
-rw-r--r--volk/include/volk/volk_32fc_index_max_16u_a.h215
-rw-r--r--volk/include/volk/volk_32fc_magnitude_32f_a.h132
-rw-r--r--volk/include/volk/volk_32fc_magnitude_32f_u.h118
-rw-r--r--volk/include/volk/volk_32fc_magnitude_squared_32f_a.h114
-rw-r--r--volk/include/volk/volk_32fc_magnitude_squared_32f_u.h114
-rw-r--r--volk/include/volk/volk_32fc_s32f_atan2_32f_a.h158
-rw-r--r--volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h81
-rw-r--r--volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h159
-rw-r--r--volk/include/volk/volk_32fc_s32f_power_32fc_a.h111
-rw-r--r--volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h126
-rw-r--r--volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h134
-rw-r--r--volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h91
-rw-r--r--volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h87
-rw-r--r--volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h74
-rw-r--r--volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h257
-rw-r--r--volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h345
-rw-r--r--volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h145
-rw-r--r--volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h440
-rw-r--r--volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h116
-rw-r--r--volk/include/volk/volk_32fc_x2_multiply_32fc_a.h93
-rw-r--r--volk/include/volk/volk_32fc_x2_multiply_32fc_u.h77
-rw-r--r--volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h81
-rw-r--r--volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h81
-rw-r--r--volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h126
-rw-r--r--volk/include/volk/volk_32fc_x2_square_dist_32f_a.h112
-rw-r--r--volk/include/volk/volk_32i_s32f_convert_32f_a.h73
-rw-r--r--volk/include/volk/volk_32i_s32f_convert_32f_u.h75
-rw-r--r--volk/include/volk/volk_32i_x2_and_32i_a.h81
-rw-r--r--volk/include/volk/volk_32i_x2_or_32i_a.h81
-rw-r--r--volk/include/volk/volk_32u_byteswap_a.h77
-rw-r--r--volk/include/volk/volk_32u_byteswap_u.h77
-rw-r--r--volk/include/volk/volk_32u_popcnt_a.h36
-rw-r--r--volk/include/volk/volk_64f_convert_32f_a.h67
-rw-r--r--volk/include/volk/volk_64f_convert_32f_u.h67
-rw-r--r--volk/include/volk/volk_64f_x2_max_64f_a.h71
-rw-r--r--volk/include/volk/volk_64f_x2_min_64f_a.h71
-rw-r--r--volk/include/volk/volk_64u_byteswap_a.h88
-rw-r--r--volk/include/volk/volk_64u_byteswap_u.h88
-rw-r--r--volk/include/volk/volk_64u_popcnt_a.h52
-rw-r--r--volk/include/volk/volk_8i_convert_16i_a.h83
-rw-r--r--volk/include/volk/volk_8i_convert_16i_u.h73
-rw-r--r--volk/include/volk/volk_8i_s32f_convert_32f_a.h106
-rw-r--r--volk/include/volk/volk_8i_s32f_convert_32f_u.h94
-rw-r--r--volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h77
-rw-r--r--volk/include/volk/volk_8ic_deinterleave_real_16i_a.h66
-rw-r--r--volk/include/volk/volk_8ic_deinterleave_real_8i_a.h67
-rw-r--r--volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h165
-rw-r--r--volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h134
-rw-r--r--volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h101
-rw-r--r--volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h122
-rw-r--r--volk/include/volk/volk_common.h96
-rw-r--r--volk/include/volk/volk_complex.h86
-rw-r--r--volk/include/volk/volk_prefs.h28
-rw-r--r--volk/kernels/README.txt67
-rw-r--r--volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h122
-rw-r--r--volk/kernels/volk/volk_16i_branch_4_state_8.h194
-rw-r--r--volk/kernels/volk/volk_16i_convert_8i.h140
-rw-r--r--volk/kernels/volk/volk_16i_max_star_16i.h110
-rw-r--r--volk/kernels/volk/volk_16i_max_star_horizontal_16i.h134
-rw-r--r--volk/kernels/volk/volk_16i_permute_and_scalar_add.h142
-rw-r--r--volk/kernels/volk/volk_16i_s32f_convert_32f.h241
-rw-r--r--volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h192
-rw-r--r--volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h140
-rw-r--r--volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h158
-rw-r--r--volk/kernels/volk/volk_16ic_deinterleave_real_16i.h120
-rw-r--r--volk/kernels/volk/volk_16ic_deinterleave_real_8i.h94
-rw-r--r--volk/kernels/volk/volk_16ic_magnitude_16i.h191
-rw-r--r--volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h109
-rw-r--r--volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h126
-rw-r--r--volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h180
-rw-r--r--volk/kernels/volk/volk_16u_byteswap.h140
-rw-r--r--volk/kernels/volk/volk_32f_accumulator_s32f.h68
-rw-r--r--volk/kernels/volk/volk_32f_convert_64f.h140
-rw-r--r--volk/kernels/volk/volk_32f_index_max_16u.h149
-rw-r--r--volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h120
-rw-r--r--volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h168
-rw-r--r--volk/kernels/volk/volk_32f_s32f_convert_16i.h302
-rw-r--r--volk/kernels/volk/volk_32f_s32f_convert_32i.h331
-rw-r--r--volk/kernels/volk/volk_32f_s32f_convert_8i.h312
-rw-r--r--volk/kernels/volk/volk_32f_s32f_multiply_32f.h221
-rw-r--r--volk/kernels/volk/volk_32f_s32f_normalize.h81
-rw-r--r--volk/kernels/volk/volk_32f_s32f_power_32f.h144
-rw-r--r--volk/kernels/volk/volk_32f_s32f_stddev_32f.h145
-rw-r--r--volk/kernels/volk/volk_32f_sqrt_32f.h77
-rw-r--r--volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h170
-rw-r--r--volk/kernels/volk/volk_32f_x2_add_32f.h147
-rw-r--r--volk/kernels/volk/volk_32f_x2_divide_32f.h82
-rw-r--r--volk/kernels/volk/volk_32f_x2_dot_prod_16i.h98
-rw-r--r--volk/kernels/volk/volk_32f_x2_dot_prod_32f.h580
-rw-r--r--volk/kernels/volk/volk_32f_x2_interleave_32fc.h75
-rw-r--r--volk/kernels/volk/volk_32f_x2_max_32f.h85
-rw-r--r--volk/kernels/volk/volk_32f_x2_min_32f.h85
-rw-r--r--volk/kernels/volk/volk_32f_x2_multiply_32f.h226
-rw-r--r--volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h156
-rw-r--r--volk/kernels/volk/volk_32f_x2_subtract_32f.h81
-rw-r--r--volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h152
-rw-r--r--volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h111
-rw-r--r--volk/kernels/volk/volk_32fc_32f_multiply_32fc.h95
-rw-r--r--volk/kernels/volk/volk_32fc_conjugate_32fc.h128
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h75
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h156
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h68
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_real_32f.h68
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_real_64f.h66
-rw-r--r--volk/kernels/volk/volk_32fc_index_max_16u.h218
-rw-r--r--volk/kernels/volk/volk_32fc_magnitude_32f.h250
-rw-r--r--volk/kernels/volk/volk_32fc_magnitude_squared_32f.h228
-rw-r--r--volk/kernels/volk/volk_32fc_s32f_atan2_32f.h158
-rw-r--r--volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h81
-rw-r--r--volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h159
-rw-r--r--volk/kernels/volk/volk_32fc_s32f_power_32fc.h111
-rw-r--r--volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h126
-rw-r--r--volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h134
-rw-r--r--volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h178
-rw-r--r--volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h74
-rw-r--r--volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h257
-rw-r--r--volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h500
-rw-r--r--volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h562
-rw-r--r--volk/kernels/volk/volk_32fc_x2_multiply_32fc.h170
-rw-r--r--volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h162
-rw-r--r--volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h130
-rw-r--r--volk/kernels/volk/volk_32fc_x2_square_dist_32f.h116
-rw-r--r--volk/kernels/volk/volk_32i_s32f_convert_32f.h148
-rw-r--r--volk/kernels/volk/volk_32i_x2_and_32i.h81
-rw-r--r--volk/kernels/volk/volk_32i_x2_or_32i.h81
-rw-r--r--volk/kernels/volk/volk_32u_byteswap.h154
-rw-r--r--volk/kernels/volk/volk_32u_popcnt.h36
-rw-r--r--volk/kernels/volk/volk_64f_convert_32f.h134
-rw-r--r--volk/kernels/volk/volk_64f_x2_max_64f.h71
-rw-r--r--volk/kernels/volk/volk_64f_x2_min_64f.h71
-rw-r--r--volk/kernels/volk/volk_64u_byteswap.h176
-rw-r--r--volk/kernels/volk/volk_64u_popcnt.h52
-rw-r--r--volk/kernels/volk/volk_8i_convert_16i.h156
-rw-r--r--volk/kernels/volk/volk_8i_s32f_convert_32f.h200
-rw-r--r--volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h77
-rw-r--r--volk/kernels/volk/volk_8ic_deinterleave_real_16i.h66
-rw-r--r--volk/kernels/volk/volk_8ic_deinterleave_real_8i.h67
-rw-r--r--volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h165
-rw-r--r--volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h134
-rw-r--r--volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h101
-rw-r--r--volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h122
-rw-r--r--volk/lib/CMakeLists.txt352
-rw-r--r--volk/lib/gcc_x86_cpuid.h188
-rw-r--r--volk/lib/qa_16s_add_quad_aligned16.cc89
-rw-r--r--volk/lib/qa_16s_add_quad_aligned16.h18
-rw-r--r--volk/lib/qa_16s_branch_4_state_8_aligned16.cc106
-rw-r--r--volk/lib/qa_16s_branch_4_state_8_aligned16.h18
-rw-r--r--volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc78
-rw-r--r--volk/lib/qa_16s_permute_and_scalar_add_aligned16.h18
-rw-r--r--volk/lib/qa_16s_quad_max_star_aligned16.cc60
-rw-r--r--volk/lib/qa_16s_quad_max_star_aligned16.h18
-rw-r--r--volk/lib/qa_32f_fm_detect_aligned16.cc61
-rw-r--r--volk/lib/qa_32f_fm_detect_aligned16.h18
-rw-r--r--volk/lib/qa_32f_index_max_aligned16.cc103
-rw-r--r--volk/lib/qa_32f_index_max_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_index_max_aligned16.cc89
-rw-r--r--volk/lib/qa_32fc_index_max_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc64
-rw-r--r--volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h18
-rw-r--r--volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc138
-rw-r--r--volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h18
-rw-r--r--volk/lib/qa_32u_popcnt_aligned16.cc62
-rw-r--r--volk/lib/qa_32u_popcnt_aligned16.h18
-rw-r--r--volk/lib/qa_64u_popcnt_aligned16.cc62
-rw-r--r--volk/lib/qa_64u_popcnt_aligned16.h18
-rw-r--r--volk/lib/qa_utils.cc477
-rw-r--r--volk/lib/qa_utils.h41
-rw-r--r--volk/lib/testqa.cc90
-rw-r--r--volk/lib/volk_prefs.c50
-rw-r--r--volk/lib/volk_rank_archs.c112
-rw-r--r--volk/lib/volk_rank_archs.h50
-rw-r--r--volk/orc/volk_16i_s32f_deinterleave_32f_x2_a_orc_impl.orc12
-rw-r--r--volk/orc/volk_16ic_deinterleave_16i_x2_a_orc_impl.orc5
-rw-r--r--volk/orc/volk_16ic_deinterleave_real_8i_a_orc_impl.orc6
-rw-r--r--volk/orc/volk_16ic_magnitude_16i_a_orc_impl.orc23
-rw-r--r--volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc25
-rw-r--r--volk/orc/volk_16u_byteswap_a_orc_impl.orc3
-rw-r--r--volk/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc5
-rw-r--r--volk/orc/volk_32f_s32f_normalize_a_orc_impl.orc5
-rw-r--r--volk/orc/volk_32f_sqrt_32f_a_orc_impl.orc4
-rw-r--r--volk/orc/volk_32f_x2_add_32f_a_orc_impl.orc5
-rw-r--r--volk/orc/volk_32f_x2_divide_32f_a_orc_impl.orc5
-rw-r--r--volk/orc/volk_32f_x2_dot_prod_32f_a_orc_impl.orc6
-rw-r--r--volk/orc/volk_32f_x2_max_32f_a_orc_impl.orc5
-rw-r--r--volk/orc/volk_32f_x2_min_32f_a_orc_impl.orc5
-rw-r--r--volk/orc/volk_32f_x2_multiply_32f_a_orc_impl.orc5
-rw-r--r--volk/orc/volk_32f_x2_subtract_32f_a_orc_impl.orc5
-rw-r--r--volk/orc/volk_32fc_32f_multiply_32fc_a_orc_impl.orc7
-rw-r--r--volk/orc/volk_32fc_magnitude_32f_a_orc_impl.orc13
-rw-r--r--volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc23
-rw-r--r--volk/orc/volk_32fc_s32fc_multiply_32fc_a_orc_impl.orc18
-rw-r--r--volk/orc/volk_32fc_x2_multiply_32fc_a_orc_impl.orc18
-rw-r--r--volk/orc/volk_32i_x2_and_32i_a_orc_impl.orc5
-rw-r--r--volk/orc/volk_32i_x2_or_32i_a_orc_impl.orc5
-rw-r--r--volk/orc/volk_8i_convert_16i_a_orc_impl.orc6
-rw-r--r--volk/orc/volk_8i_s32f_convert_32f_a_orc_impl.orc11
-rw-r--r--volk/spu_lib/gc_spu_macs.h380
-rw-r--r--volk/spu_lib/spu_16s_cmpgt_unaligned.c160
-rw-r--r--volk/spu_lib/spu_16s_vector_subtract_unaligned.c178
-rw-r--r--volk/spu_lib/spu_16s_vector_sum_unaligned.c178
-rw-r--r--volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c222
-rw-r--r--volk/spu_lib/spu_memcpy_unaligned.c290
-rw-r--r--volk/spu_lib/spu_memset_unaligned.S185
-rw-r--r--volk/tmpl/volk.tmpl.c169
-rw-r--r--volk/tmpl/volk.tmpl.h87
-rw-r--r--volk/tmpl/volk_config_fixed.tmpl.h29
-rw-r--r--volk/tmpl/volk_cpu.tmpl.c184
-rw-r--r--volk/tmpl/volk_cpu.tmpl.h42
-rw-r--r--volk/tmpl/volk_machine_xxx.tmpl.c79
-rw-r--r--volk/tmpl/volk_machines.tmpl.c34
-rw-r--r--volk/tmpl/volk_machines.tmpl.h55
-rw-r--r--volk/tmpl/volk_typedefs.tmpl.h32
-rw-r--r--volk/volk.pc.in14
292 files changed, 33662 insertions, 0 deletions
diff --git a/volk/CMakeLists.txt b/volk/CMakeLists.txt
new file mode 100644
index 000000000..a66c9ca30
--- /dev/null
+++ b/volk/CMakeLists.txt
@@ -0,0 +1,138 @@
+#
+# Copyright 2011 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+########################################################################
+# Project setup
+########################################################################
+cmake_minimum_required(VERSION 2.6)
+if(NOT DEFINED CMAKE_BUILD_TYPE) #fall back to a Release build when no build type was given
+ set(CMAKE_BUILD_TYPE Release)
+endif()
+set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "Choose build type: None Debug Release RelWithDebInfo MinSizeRel")
+project(volk)
+enable_language(CXX)
+enable_language(C)
+enable_testing()
+set(VERSION 0.1)
+set(LIBVER 0.0.0)
+#re-point the top-level directory variables at this directory; the install rules below use them
+set(CMAKE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) #allows this to be a sub-project
+set(CMAKE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) #allows this to be a sub-project
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) #location for custom "Modules"
+
+########################################################################
+# Environment setup
+########################################################################
+if(NOT DEFINED BOOST_ROOT)
+ set(BOOST_ROOT ${CMAKE_INSTALL_PREFIX})
+endif()
+#CROSSCOMPILE_MULTILIB defaults to empty and is exposed as a cache option
+if(NOT DEFINED CROSSCOMPILE_MULTILIB)
+ set(CROSSCOMPILE_MULTILIB "")
+endif()
+set(CROSSCOMPILE_MULTILIB ${CROSSCOMPILE_MULTILIB} CACHE STRING "Define \"true\" if you have and want to use multiple C development libs installed for cross compile")
+
+
+########################################################################
+# Dependencies setup
+########################################################################
+include(GrPython) #sets PYTHON_EXECUTABLE and PYTHON_DASH_B
+gr_python_check_module("python >= 2.5" sys "sys.version.split()[0] >= '2.5'" PYTHON_MIN_VER_FOUND)
+gr_python_check_module("Cheetah >= 2.0.0" Cheetah "Cheetah.Version >= '2.0.0'" CHEETAH_FOUND)
+#both checks above are mandatory: abort the configure step if either one failed
+if(NOT PYTHON_MIN_VER_FOUND)
+ message(FATAL_ERROR "Python 2.5 or greater required to build VOLK")
+endif()
+
+if(NOT CHEETAH_FOUND)
+ message(FATAL_ERROR "Cheetah templates required to build VOLK")
+endif()
+#on MSVC, boost dynamic linking is on unless BOOST_ALL_DYN_LINK was already defined
+if(MSVC)
+ if(NOT DEFINED BOOST_ALL_DYN_LINK)
+ set(BOOST_ALL_DYN_LINK TRUE)
+ endif()
+ set(BOOST_ALL_DYN_LINK "${BOOST_ALL_DYN_LINK}" CACHE BOOL "boost enable dynamic linking")
+ if(BOOST_ALL_DYN_LINK)
+ add_definitions(-DBOOST_ALL_DYN_LINK) #setup boost auto-linking in msvc
+ else()
+ unset(BOOST_REQUIRED_COMPONENTS) #empty components list for static link
+ endif()
+endif()
+include(GrBoost)
+#boost is a hard requirement
+if(NOT Boost_FOUND)
+ message(FATAL_ERROR "VOLK Requires boost to build")
+endif()
+#ORC is looked up without REQUIRED, so a missing ORC does not stop the configure step here
+find_package(ORC)
+
+########################################################################
+# Setup the package config file
+########################################################################
+#set variables found in the pc.in file
+set(prefix ${CMAKE_INSTALL_PREFIX})
+set(exec_prefix "\${prefix}") #escaped on purpose: the value is the literal text ${prefix}, not expanded by CMake
+set(libdir "\${exec_prefix}/lib${LIB_SUFFIX}") #only LIB_SUFFIX is expanded here; ${exec_prefix} stays literal
+set(includedir "\${prefix}/include")
+#fill in volk.pc.in and write the result to the build directory
+configure_file(
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk.pc.in
+ ${CMAKE_CURRENT_BINARY_DIR}/volk.pc
+@ONLY) #replace only @VAR@ references; ${...} text in the template is left untouched
+#install the configured volk.pc under lib${LIB_SUFFIX}/pkgconfig
+install(
+ FILES ${CMAKE_CURRENT_BINARY_DIR}/volk.pc
+ DESTINATION lib${LIB_SUFFIX}/pkgconfig
+ COMPONENT "volk_devel"
+)
+
+########################################################################
+# Install all headers in the include directories
+########################################################################
+install(
+ DIRECTORY ${CMAKE_SOURCE_DIR}/kernels/volk #no trailing slash: the volk directory itself is created under include/
+ DESTINATION include COMPONENT "volk_devel"
+ FILES_MATCHING PATTERN "*.h" #only the .h files are copied
+)
+#remaining public headers: three from the source tree, four from the build tree (presumably generated from tmpl/ -- confirm in lib/)
+install(FILES
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_prefs.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_complex.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_common.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk_typedefs.h
+ DESTINATION include/volk
+ COMPONENT "volk_devel"
+)
+
+########################################################################
+# Setup the library
+########################################################################
+add_subdirectory(lib) #processes lib/CMakeLists.txt
+
+########################################################################
+# And the utility apps
+########################################################################
+add_subdirectory(apps) #processes apps/CMakeLists.txt (the volk_profile executable)
+
+########################################################################
+# Print summary
+########################################################################
+message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}")
diff --git a/volk/apps/CMakeLists.txt b/volk/apps/CMakeLists.txt
new file mode 100644
index 000000000..f847dd624
--- /dev/null
+++ b/volk/apps/CMakeLists.txt
@@ -0,0 +1,49 @@
+#
+# Copyright 2011-2013 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+########################################################################
+# Setup profiler
+########################################################################
+if(Boost_FOUND)
+#extra include directory used only when building with MSVC
+if(MSVC)
+ include_directories(${CMAKE_SOURCE_DIR}/cmake/msvc)
+endif()
+#header search path for the profiler sources
+include_directories(
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${CMAKE_CURRENT_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/include
+ ${CMAKE_BINARY_DIR}/include
+ ${CMAKE_SOURCE_DIR}/lib
+ ${Boost_INCLUDE_DIRS}
+)
+#volk_profile is built from its own source plus lib/qa_utils.cc
+add_executable(volk_profile
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk_profile.cc
+ ${CMAKE_SOURCE_DIR}/lib/qa_utils.cc
+)
+#link against the volk target and boost
+target_link_libraries(volk_profile volk ${Boost_LIBRARIES})
+#the executable is installed to bin as part of the "volk" component
+install(
+ TARGETS volk_profile
+ DESTINATION bin
+ COMPONENT "volk"
+)
+
+endif()
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
new file mode 100644
index 000000000..fa89a93bf
--- /dev/null
+++ b/volk/apps/volk_profile.cc
@@ -0,0 +1,138 @@
+#include "qa_utils.h"
+
+#include <volk/volk.h>
+#include <volk/volk_prefs.h>
+
+#include <ciso646>
+#include <vector>
+#include <boost/foreach.hpp>
+#include <boost/filesystem.hpp>
+#include <iostream>
+#include <fstream>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+namespace fs = boost::filesystem;
+
+int main(int argc, char *argv[]) {
+ //Runs every VOLK_PROFILE entry below (each one receives &results), then writes the collected lines to the volk config file. Returns 1 if that file cannot be opened.
+ std::vector<std::string> results;
+
+ //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results);
+ //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results);
+ VOLK_PUPPET_PROFILE(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, 1e-2, (lv_32fc_t)lv_cmake(.95393, .3), 20460, 10000, &results);
+ VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f, 1e-5, 32768.0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16ic_deinterleave_real_8i, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16ic_deinterleave_16i_x2, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16ic_s32f_deinterleave_32f_x2, 1e-4, 32768.0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_16ic_deinterleave_real_16i, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16ic_magnitude_16i, 1, 0, 204600, 100, &results);
+ VOLK_PROFILE(volk_16ic_s32f_magnitude_32f, 1e-5, 32768.0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_16i_s32f_convert_32f, 1e-4, 32768.0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16i_convert_8i, 0, 0, 204600, 10000, &results);
+ //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204600, 10000, &results);
+ //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204600, 10000, &results);
+ //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results);
+ //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results);
+ VOLK_PROFILE(volk_16u_byteswap, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_s32f_power_32fc, 1e-4, 0, 204600, 50, &results);
+ VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204600, 100, &results);
+ //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000, &results);
+ VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_deinterleave_imag_32f, 1e-4, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32fc_deinterleave_real_32f, 1e-4, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32fc_deinterleave_real_64f, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_32f_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_index_max_16u, 3, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_s32f_magnitude_16i, 1, 32768, 204600, 100, &results);
+ VOLK_PROFILE(volk_32fc_magnitude_32f, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_magnitude_squared_32f, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_x2_multiply_32fc, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_conjugate_32fc, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32f_s32f_convert_16i, 1, 32768, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_s32f_convert_32i, 1, 2<<31, 204600, 10000, &results); //NOTE(review): 2<<31 overflows a 32-bit int; confirm the intended scale factor
+ VOLK_PROFILE(volk_32f_convert_64f, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_s32f_convert_8i, 1, 128, 204600, 10000, &results);
+ //VOLK_PROFILE(volk_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000, &results);
+ VOLK_PROFILE(volk_32fc_s32f_power_spectrum_32f, 1e-4, 0, 20460, 100, &results);
+ VOLK_PROFILE(volk_32fc_x2_square_dist_32f, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, 1e-4, 10, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_x2_divide_32f, 1e-4, 0, 204600, 2000, &results);
+ VOLK_PROFILE(volk_32f_x2_dot_prod_32f, 1e-4, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32f_x2_dot_prod_16i, 1e-4, 0, 204600, 5000, &results);
+ //VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000, &results);
+ VOLK_PROFILE(volk_32f_index_max_16u, 3, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic, 1, 32768, 204600, 3000, &results);
+ VOLK_PROFILE(volk_32f_x2_interleave_32fc, 0, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32f_x2_max_32f, 1e-4, 0, 204600, 2000, &results);
+ VOLK_PROFILE(volk_32f_x2_min_32f, 1e-4, 0, 204600, 2000, &results);
+ VOLK_PROFILE(volk_32f_x2_multiply_32f, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_s32f_normalize, 1e-4, 100, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_s32f_power_32f, 1e-4, 4, 204600, 100, &results);
+ VOLK_PROFILE(volk_32f_sqrt_32f, 1e-4, 0, 204600, 100, &results);
+ VOLK_PROFILE(volk_32f_s32f_stddev_32f, 1e-4, 100, 204600, 3000, &results);
+ VOLK_PROFILE(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 204600, 3000, &results);
+ VOLK_PROFILE(volk_32f_x2_subtract_32f, 1e-4, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32i_x2_and_32i, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32i_s32f_convert_32f, 1e-4, 100, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32i_x2_or_32i, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32u_byteswap, 0, 0, 204600, 2000, &results);
+ //VOLK_PROFILE(volk_32u_popcnt, 0, 0, 2046, 10000, &results);
+ VOLK_PROFILE(volk_64f_convert_32f, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_64f_x2_max_64f, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_64f_x2_min_64f, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_64u_byteswap, 0, 0, 204600, 1000, &results);
+ //VOLK_PROFILE(volk_64u_popcnt, 0, 0, 2046, 10000, &results);
+ VOLK_PROFILE(volk_8ic_deinterleave_16i_x2, 0, 0, 204600, 3000, &results);
+ VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 204600, 3000, &results);
+ VOLK_PROFILE(volk_8ic_deinterleave_real_16i, 0, 256, 204600, 3000, &results);
+ VOLK_PROFILE(volk_8ic_s32f_deinterleave_real_32f, 1e-4, 100, 204600, 3000, &results);
+ VOLK_PROFILE(volk_8ic_deinterleave_real_8i, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_8ic_x2_multiply_conjugate_16ic, 0, 0, 204600, 400, &results);
+ VOLK_PROFILE(volk_8ic_x2_s32f_multiply_conjugate_32fc, 1e-4, 100, 204600, 400, &results);
+ VOLK_PROFILE(volk_8i_convert_16i, 0, 0, 204600, 20000, &results);
+ VOLK_PROFILE(volk_8i_convert_16i, 0, 0, 204600, 2000, &results); //NOTE(review): same kernel as the previous line, profiled a second time -- confirm both runs are wanted
+ VOLK_PROFILE(volk_8i_s32f_convert_32f, 1e-4, 100, 204600, 2000, &results);
+ //VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc, 1e-4, lv_32fc_t(1.0, 0.5), 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 1.0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 0, 204600, 1000, &results); //NOTE(review): same kernel as the previous line, profiled a second time -- confirm both runs are wanted
+
+ char path[1024];
+ volk_get_config_path(path);
+ const fs::path config_path(path);
+
+ if (not fs::exists(config_path.branch_path()))
+ {
+ std::cout << "Creating " << config_path.branch_path() << "..." << std::endl;
+ fs::create_directories(config_path.branch_path());
+ }
+
+ std::cout << "Writing " << config_path << "..." << std::endl;
+ std::ofstream config(config_path.string().c_str());
+ if(!config.is_open()) { //either we don't have write access or we don't have the dir yet
+ std::cout << "Error opening file " << config_path << std::endl;
+ return 1; //nothing can be written: report failure instead of streaming into a closed file and exiting 0
+ }
+
+ config << "\
+#this file is generated by volk_profile.\n\
+#the function name is followed by the preferred architecture.\n\
+";
+
+ BOOST_FOREACH(std::string result, results) {
+ config << result << std::endl;
+ }
+ config.close();
+}
diff --git a/volk/cmake/FindORC.cmake b/volk/cmake/FindORC.cmake
new file mode 100644
index 000000000..a5f35c465
--- /dev/null
+++ b/volk/cmake/FindORC.cmake
@@ -0,0 +1,36 @@
+# Locate the ORC 0.4 toolchain: the orcc program, the headers and the library.
+#
+# Input:
+#   ORC_ROOT          - optional install prefix searched in addition to
+#                       pkg-config hints and CMAKE_INSTALL_PREFIX
+# Output:
+#   ORC_FOUND         - set by FIND_PACKAGE_HANDLE_STANDARD_ARGS when the
+#                       library, the include dir and orcc were all located
+#   ORC_INCLUDE_DIRS  - directory containing orc/orc.h
+#   ORC_LIBRARIES     - the orc-0.4 library
+#   ORC_LIBRARY_DIRS  - directory containing the orc-0.4 shared library
+#   ORCC_EXECUTABLE   - path to the orcc program
+
+# pkg-config results are only used as HINTS for the searches below.
+FIND_PACKAGE(PkgConfig)
+PKG_CHECK_MODULES(PC_ORC "orc-0.4 > 0.4.11")
+
+FIND_PROGRAM(ORCC_EXECUTABLE orcc
+    HINTS ${PC_ORC_TOOLSDIR}
+    PATHS ${ORC_ROOT}/bin ${CMAKE_INSTALL_PREFIX}/bin)
+
+FIND_PATH(ORC_INCLUDE_DIR NAMES orc/orc.h
+    HINTS ${PC_ORC_INCLUDEDIR}
+    PATHS ${ORC_ROOT}/include/orc-0.4 ${CMAKE_INSTALL_PREFIX}/include/orc-0.4)
+
+# Directory that holds the shared library file itself.
+FIND_PATH(ORC_LIBRARY_DIR NAMES ${CMAKE_SHARED_LIBRARY_PREFIX}orc-0.4${CMAKE_SHARED_LIBRARY_SUFFIX}
+    HINTS ${PC_ORC_LIBDIR}
+    PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX})
+
+FIND_LIBRARY(ORC_LIB orc-0.4
+    HINTS ${PC_ORC_LIBRARY_DIRS}
+    PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX})
+
+LIST(APPEND ORC_LIBRARY
+    ${ORC_LIB}
+)
+
+SET(ORC_INCLUDE_DIRS ${ORC_INCLUDE_DIR})
+SET(ORC_LIBRARIES ${ORC_LIBRARY})
+SET(ORC_LIBRARY_DIRS ${ORC_LIBRARY_DIR})
+
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(ORC "orc files" ORC_LIBRARY ORC_INCLUDE_DIR ORCC_EXECUTABLE)
+
+# Hide the cache entries created by the find_* calls above from the default
+# GUI view (the previous list named ICE_* variables that this module never sets).
+mark_as_advanced(ORC_INCLUDE_DIR ORC_LIBRARY_DIR ORC_LIB ORCC_EXECUTABLE)
diff --git a/volk/cmake/GrBoost.cmake b/volk/cmake/GrBoost.cmake
new file mode 100644
index 000000000..57db9db37
--- /dev/null
+++ b/volk/cmake/GrBoost.cmake
@@ -0,0 +1,97 @@
+# Copyright 2010-2011 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+
+if(DEFINED __INCLUDED_GR_BOOST_CMAKE)
+ return()
+endif()
+set(__INCLUDED_GR_BOOST_CMAKE TRUE)
+
+########################################################################
+# Setup Boost and handle some system specific things
+########################################################################
+
+# Boost components requested from find_package(Boost ...) further down.
+set(BOOST_REQUIRED_COMPONENTS
+ filesystem
+ system
+ unit_test_framework
+)
+
+# When the user gave no BOOST_ROOT and /usr/lib64 exists, add it to the
+# library search directories handed to FindBoost.
+if(UNIX AND NOT BOOST_ROOT AND EXISTS "/usr/lib64")
+ list(APPEND BOOST_LIBRARYDIR "/usr/lib64") #fedora 64-bit fix
+endif(UNIX AND NOT BOOST_ROOT AND EXISTS "/usr/lib64")
+
+# MSVC-specific handling: one extra component plus the choice between
+# dynamic linking (auto-link via BOOST_ALL_DYN_LINK) and static linking.
+if(MSVC)
+ set(BOOST_REQUIRED_COMPONENTS ${BOOST_REQUIRED_COMPONENTS} chrono)
+
+ #default to dynamic linking unless the user already decided
+ if (NOT DEFINED BOOST_ALL_DYN_LINK)
+ set(BOOST_ALL_DYN_LINK TRUE)
+ endif()
+ #expose the choice as a cache option, seeded with the current value
+ set(BOOST_ALL_DYN_LINK "${BOOST_ALL_DYN_LINK}" CACHE BOOL "boost enable dynamic linking")
+ if(BOOST_ALL_DYN_LINK)
+ add_definitions(-DBOOST_ALL_DYN_LINK) #setup boost auto-linking in msvc
+ else(BOOST_ALL_DYN_LINK)
+ unset(BOOST_REQUIRED_COMPONENTS) #empty components list for static link
+ endif(BOOST_ALL_DYN_LINK)
+endif(MSVC)
+
+# Boost_ADDITIONAL_VERSIONS does not allow us to disable specific versions.
+# It is used internally by cmake's FindBoost to learn about Boost releases
+# newer than the ones it already knows. As Boost versions beyond what is
+# shown here are released, we must extend this list. To disable Boost
+# versions, see below.
+#
+# FindBoost only reads this list while find_package(Boost) is running, so it
+# has to be defined before that call to have any effect.
+set(Boost_ADDITIONAL_VERSIONS
+    "1.35.0" "1.35" "1.36.0" "1.36" "1.37.0" "1.37" "1.38.0" "1.38" "1.39.0" "1.39"
+    "1.40.0" "1.40" "1.41.0" "1.41" "1.42.0" "1.42" "1.43.0" "1.43" "1.44.0" "1.44"
+    "1.45.0" "1.45" "1.46.0" "1.46" "1.47.0" "1.47" "1.48.0" "1.48" "1.49.0" "1.49"
+    "1.50.0" "1.50" "1.51.0" "1.51" "1.52.0" "1.52" "1.53.0" "1.53" "1.54.0" "1.54"
+    "1.55.0" "1.55" "1.56.0" "1.56" "1.57.0" "1.57" "1.58.0" "1.58" "1.59.0" "1.59"
+    "1.60.0" "1.60" "1.61.0" "1.61" "1.62.0" "1.62" "1.63.0" "1.63" "1.64.0" "1.64"
+    "1.65.0" "1.65" "1.66.0" "1.66" "1.67.0" "1.67" "1.68.0" "1.68" "1.69.0" "1.69"
+)
+
+# Require at least Boost 1.35 with the components selected above.
+find_package(Boost "1.35" COMPONENTS ${BOOST_REQUIRED_COMPONENTS})
+
+# Boost 1.52 disabled, see https://svn.boost.org/trac/boost/ticket/7669
+# Similar problems with Boost 1.46 and 1.47.
+
+OPTION(ENABLE_BAD_BOOST "Enable known bad versions of Boost" OFF)
+if(ENABLE_BAD_BOOST)
+    MESSAGE(STATUS "Enabling use of known bad versions of Boost.")
+endif(ENABLE_BAD_BOOST)
+
+# For any unsuitable Boost version, add the version number below in
+# the following format: XXYYZZ
+# Where:
+#   XX is the major version ('10' for version 1)
+#   YY is the minor version number ('46' for 1.46)
+#   ZZ is the patcher version number (typically just '00')
+set(Boost_NOGO_VERSIONS
+    104600 104601 104700 105200
+)
+
+# Compare the detected version against every blacklisted one. Boost_VERSION
+# is quoted so that an empty value (Boost not found at all) makes the
+# comparison false instead of producing a malformed if() expression.
+foreach(ver ${Boost_NOGO_VERSIONS})
+    if("${Boost_VERSION}" EQUAL "${ver}")
+        if(NOT ENABLE_BAD_BOOST)
+            MESSAGE(STATUS "WARNING: Found a known bad version of Boost (v${Boost_VERSION}). Disabling.")
+            set(Boost_FOUND FALSE)
+        else(NOT ENABLE_BAD_BOOST)
+            MESSAGE(STATUS "WARNING: Found a known bad version of Boost (v${Boost_VERSION}). Continuing anyway.")
+            set(Boost_FOUND TRUE)
+        endif(NOT ENABLE_BAD_BOOST)
+    endif()
+endforeach(ver)
diff --git a/volk/cmake/GrPython.cmake b/volk/cmake/GrPython.cmake
new file mode 100644
index 000000000..eff9cbcdc
--- /dev/null
+++ b/volk/cmake/GrPython.cmake
@@ -0,0 +1,233 @@
+# Copyright 2010-2011 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+
+if(DEFINED __INCLUDED_GR_PYTHON_CMAKE)
+ return()
+endif()
+set(__INCLUDED_GR_PYTHON_CMAKE TRUE)
+
+########################################################################
+# Setup the python interpreter:
+# This allows the user to specify a specific interpreter,
+# or finds the interpreter via the built-in cmake module.
+########################################################################
+#this allows the user to override PYTHON_EXECUTABLE
+if(NOT PYTHON_EXECUTABLE)
+
+    #nothing was supplied by the user: start with cmake's own find module
+    find_package(PythonInterp)
+
+    #fall back to a plain program search when the module came up empty
+    if(NOT PYTHONINTERP_FOUND)
+        find_program(PYTHON_EXECUTABLE NAMES python python2.7 python2.6 python2.5)
+        if(PYTHON_EXECUTABLE)
+            set(PYTHONINTERP_FOUND TRUE)
+        endif()
+    endif()
+
+else()
+
+    #a user-specified interpreter is accepted as-is
+    set(PYTHONINTERP_FOUND TRUE)
+
+endif()
+
+#publish the interpreter path as a cache entry so it shows up in the cmake gui
+set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE FILEPATH "python interpreter")
+
+#probe whether the interpreter accepts -B (introduced in 2.6);
+#PYTHON_DASH_B is only set when the probe exits with status 0
+if(PYTHON_EXECUTABLE)
+    execute_process(
+        COMMAND ${PYTHON_EXECUTABLE} -B -c ""
+        RESULT_VARIABLE PYTHON_HAS_DASH_B_RESULT
+        OUTPUT_QUIET ERROR_QUIET
+    )
+    if(PYTHON_HAS_DASH_B_RESULT EQUAL 0)
+        set(PYTHON_DASH_B "-B")
+    endif()
+endif()
+
+########################################################################
+# Check for the existence of a python module:
+# - desc a string description of the check
+# - mod the name of the module to import
+# - cmd an additional command to run
+# - have the result variable to set
+########################################################################
+macro(GR_PYTHON_CHECK_MODULE desc mod cmd have)
+    message(STATUS "")
+    message(STATUS "Python checking for ${desc}")
+    #run a small script: it exits non-zero when neither "import ${mod}" nor
+    #evaluating ${mod} works, or when the assertion on ${cmd} fails
+    execute_process(
+        COMMAND ${PYTHON_EXECUTABLE} -c "
+#########################################
+try: import ${mod}
+except:
+ try: ${mod}
+ except: exit(-1)
+try: assert ${cmd}
+except: exit(-1)
+#########################################"
+        RESULT_VARIABLE ${have}
+    )
+    #turn the process exit status into a boolean in the same variable
+    if(NOT ${have} EQUAL 0)
+        message(STATUS "Python checking for ${desc} - not found")
+        set(${have} FALSE)
+    else()
+        message(STATUS "Python checking for ${desc} - found")
+        set(${have} TRUE)
+    endif()
+endmacro(GR_PYTHON_CHECK_MODULE)
+
+########################################################################
+# Sets the python installation directory GR_PYTHON_DIR
+########################################################################
+#Query distutils for the platform-specific library directory relative to an
+#empty prefix. print() is written as a call so the snippet parses under both
+#Python 2 and Python 3.
+execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "
+from distutils import sysconfig
+print(sysconfig.get_python_lib(plat_specific=True, prefix=''))
+" OUTPUT_VARIABLE GR_PYTHON_DIR OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+#quoted so that an empty result (interpreter missing or script failed) does
+#not turn this into a call with too few arguments
+file(TO_CMAKE_PATH "${GR_PYTHON_DIR}" GR_PYTHON_DIR)
+
+########################################################################
+# Create an always-built target with a unique name
+# Usage: GR_UNIQUE_TARGET(<description> <dependencies list>)
+########################################################################
+function(GR_UNIQUE_TARGET desc)
+    #desc: human readable prefix for the target name
+    #ARGN: files the always-built target depends on
+    file(RELATIVE_PATH reldir ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+    #Build the name from desc, the relative binary dir and the first 5 hex
+    #digits of an md5 over dir+dependencies; non-word characters become '_'.
+    #Python 3's md5 rejects text strings with TypeError, so retry with the
+    #utf-8 encoded bytes; Python 2 takes the first branch unchanged.
+    execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import re, hashlib
+key = '${reldir}${ARGN}'
+try: digest = hashlib.md5(key)
+except TypeError: digest = hashlib.md5(key.encode('utf-8'))
+unique = digest.hexdigest()[:5]
+print(re.sub('\\W', '_', '${desc} ${reldir} ' + unique))"
+    OUTPUT_VARIABLE _target OUTPUT_STRIP_TRAILING_WHITESPACE)
+    add_custom_target(${_target} ALL DEPENDS ${ARGN})
+endfunction(GR_UNIQUE_TARGET)
+
+########################################################################
+# Install python sources (also builds and installs byte-compiled python)
+########################################################################
+function(GR_PYTHON_INSTALL)
+    #Keyword arguments (parsed into GR_PYTHON_INSTALL_<KEYWORD>):
+    #  DESTINATION <dir>   - install destination
+    #  COMPONENT <name>    - install component
+    #  FILES <files...>    - python sources; .pyc/.pyo files are generated
+    #                        and installed next to them
+    #  PROGRAMS <files...> - python scripts; a copy with a shebang line is
+    #                        generated and installed (only used when FILES
+    #                        is not given)
+    include(CMakeParseArgumentsCopy)
+    CMAKE_PARSE_ARGUMENTS(GR_PYTHON_INSTALL "" "DESTINATION;COMPONENT" "FILES;PROGRAMS" ${ARGN})
+
+    #start from an empty list so a variable of the same name in the
+    #calling scope cannot leak into the generated target's dependencies
+    unset(python_install_gen_targets)
+
+    ####################################################################
+    if(GR_PYTHON_INSTALL_FILES)
+    ####################################################################
+        install(${ARGN}) #installs regular python files
+
+        #create a list of all generated files
+        unset(pysrcfiles)
+        unset(pycfiles)
+        unset(pyofiles)
+        foreach(pyfile ${GR_PYTHON_INSTALL_FILES})
+            get_filename_component(pyfile ${pyfile} ABSOLUTE)
+            list(APPEND pysrcfiles ${pyfile})
+
+            #determine if this file is in the source or binary directory:
+            #the shorter relative path identifies the directory it lives in
+            file(RELATIVE_PATH source_rel_path ${CMAKE_CURRENT_SOURCE_DIR} ${pyfile})
+            string(LENGTH "${source_rel_path}" source_rel_path_len)
+            file(RELATIVE_PATH binary_rel_path ${CMAKE_CURRENT_BINARY_DIR} ${pyfile})
+            string(LENGTH "${binary_rel_path}" binary_rel_path_len)
+
+            #and set the generated path appropriately
+            if(${source_rel_path_len} GREATER ${binary_rel_path_len})
+                set(pygenfile ${CMAKE_CURRENT_BINARY_DIR}/${binary_rel_path})
+            else()
+                set(pygenfile ${CMAKE_CURRENT_BINARY_DIR}/${source_rel_path})
+            endif()
+            list(APPEND pycfiles ${pygenfile}c)
+            list(APPEND pyofiles ${pygenfile}o)
+
+            #ensure generation path exists
+            get_filename_component(pygen_path ${pygenfile} PATH)
+            file(MAKE_DIRECTORY ${pygen_path})
+
+        endforeach(pyfile)
+
+        #the command to generate the pyc files
+        add_custom_command(
+            DEPENDS ${pysrcfiles} OUTPUT ${pycfiles}
+            COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_BINARY_DIR}/python_compile_helper.py ${pysrcfiles} ${pycfiles}
+        )
+
+        #the command to generate the pyo files
+        add_custom_command(
+            DEPENDS ${pysrcfiles} OUTPUT ${pyofiles}
+            COMMAND ${PYTHON_EXECUTABLE} -O ${CMAKE_BINARY_DIR}/python_compile_helper.py ${pysrcfiles} ${pyofiles}
+        )
+
+        #create install rule and add generated files to target list
+        set(python_install_gen_targets ${pycfiles} ${pyofiles})
+        install(FILES ${python_install_gen_targets}
+            DESTINATION ${GR_PYTHON_INSTALL_DESTINATION}
+            COMPONENT ${GR_PYTHON_INSTALL_COMPONENT}
+        )
+
+    ####################################################################
+    elseif(GR_PYTHON_INSTALL_PROGRAMS)
+    ####################################################################
+        file(TO_NATIVE_PATH ${PYTHON_EXECUTABLE} pyexe_native)
+
+        #when cross compiling the build machine's interpreter path is not
+        #valid on the target, so resolve python through env instead.
+        #The value is quoted: unquoted it would become a two-element list
+        #and expand to "/usr/bin/env;python" inside the shebang.
+        if (CMAKE_CROSSCOMPILING)
+            set(pyexe_native "/usr/bin/env python")
+        endif()
+
+        foreach(pyfile ${GR_PYTHON_INSTALL_PROGRAMS})
+            get_filename_component(pyfile_name ${pyfile} NAME)
+            get_filename_component(pyfile ${pyfile} ABSOLUTE)
+            #generated copy lives at the mirrored location in the build tree
+            string(REPLACE "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" pyexefile "${pyfile}.exe")
+            list(APPEND python_install_gen_targets ${pyexefile})
+
+            get_filename_component(pyexefile_path ${pyexefile} PATH)
+            file(MAKE_DIRECTORY ${pyexefile_path})
+
+            #write "#!<interpreter>" followed by the original script contents
+            add_custom_command(
+                OUTPUT ${pyexefile} DEPENDS ${pyfile}
+                COMMAND ${PYTHON_EXECUTABLE} -c
+                \"open('${pyexefile}', 'w').write('\#!${pyexe_native}\\n'+open('${pyfile}').read())\"
+                COMMENT "Shebangin ${pyfile_name}"
+            )
+
+            #on windows, python files need an extension to execute
+            get_filename_component(pyfile_ext ${pyfile} EXT)
+            if(WIN32 AND NOT pyfile_ext)
+                set(pyfile_name "${pyfile_name}.py")
+            endif()
+
+            install(PROGRAMS ${pyexefile} RENAME ${pyfile_name}
+                DESTINATION ${GR_PYTHON_INSTALL_DESTINATION}
+                COMPONENT ${GR_PYTHON_INSTALL_COMPONENT}
+            )
+        endforeach(pyfile)
+
+    endif()
+
+    #always-built target that forces generation of the files collected above
+    GR_UNIQUE_TARGET("pygen" ${python_install_gen_targets})
+
+endfunction(GR_PYTHON_INSTALL)
+
+########################################################################
+# Write the python helper script that generates byte code files
+########################################################################
+#The helper receives N source paths followed by N output paths and compiles
+#source i into output i. The midpoint uses floor division (//) so it stays an
+#integer on Python 3 as well; on Python 2 the result is the same as before.
+file(WRITE ${CMAKE_BINARY_DIR}/python_compile_helper.py "
+import sys, py_compile
+files = sys.argv[1:]
+half = len(files)//2
+srcs, gens = files[:half], files[half:]
+for src, gen in zip(srcs, gens):
+    py_compile.compile(file=src, cfile=gen, doraise=True)
+")
diff --git a/volk/cmake/msvc/config.h b/volk/cmake/msvc/config.h
new file mode 100644
index 000000000..43792c783
--- /dev/null
+++ b/volk/cmake/msvc/config.h
@@ -0,0 +1,58 @@
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_CONFIG_H_ // [
+#define _MSC_CONFIG_H_
+
+////////////////////////////////////////////////////////////////////////
+// enable inline functions for C code
+////////////////////////////////////////////////////////////////////////
+#ifndef __cplusplus
+# define inline __inline
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// signed size_t
+////////////////////////////////////////////////////////////////////////
+#include <stddef.h>
+typedef ptrdiff_t ssize_t;
+
+////////////////////////////////////////////////////////////////////////
+// rint functions
+//
+// Fallbacks for compilers whose <math.h> lacks the C99 rounding functions.
+// Visual Studio 2013 (_MSC_VER 1800) and later provide them in the CRT,
+// where these static definitions would clash with the CRT declarations,
+// so they are only compiled for older compilers.
+// Note: the fallbacks round halfway cases away from zero.
+////////////////////////////////////////////////////////////////////////
+#include <math.h>
+#if _MSC_VER < 1800
+static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);}
+static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
+static inline long long llrint(double x){return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);}
+static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
+static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);}
+static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x - 0.5f);}
+#endif // _MSC_VER < 1800
+
+////////////////////////////////////////////////////////////////////////
+// math constants
+//
+// Each constant is only defined when <math.h> has not already provided it
+// (e.g. via _USE_MATH_DEFINES or a newer CRT), which avoids macro
+// redefinition warnings while keeping the same values otherwise.
+////////////////////////////////////////////////////////////////////////
+#ifndef INFINITY
+# define INFINITY HUGE_VAL
+#endif
+
+#ifndef M_E
+# define M_E 2.7182818284590452354 /* e */
+#endif
+#ifndef M_LOG2E
+# define M_LOG2E 1.4426950408889634074 /* log_2 e */
+#endif
+#ifndef M_LOG10E
+# define M_LOG10E 0.43429448190325182765 /* log_10 e */
+#endif
+#ifndef M_LN2
+# define M_LN2 0.69314718055994530942 /* log_e 2 */
+#endif
+#ifndef M_LN10
+# define M_LN10 2.30258509299404568402 /* log_e 10 */
+#endif
+#ifndef M_PI
+# define M_PI 3.14159265358979323846 /* pi */
+#endif
+#ifndef M_PI_2
+# define M_PI_2 1.57079632679489661923 /* pi/2 */
+#endif
+#ifndef M_PI_4
+# define M_PI_4 0.78539816339744830962 /* pi/4 */
+#endif
+#ifndef M_1_PI
+# define M_1_PI 0.31830988618379067154 /* 1/pi */
+#endif
+#ifndef M_2_PI
+# define M_2_PI 0.63661977236758134308 /* 2/pi */
+#endif
+#ifndef M_2_SQRTPI
+# define M_2_SQRTPI 1.12837916709551257390 /* 2/sqrt(pi) */
+#endif
+#ifndef M_SQRT2
+# define M_SQRT2 1.41421356237309504880 /* sqrt(2) */
+#endif
+#ifndef M_SQRT1_2
+# define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// random and srandom, implemented as thin wrappers over rand and srand
+////////////////////////////////////////////////////////////////////////
+#include <stdlib.h>
+
+// Returns the next value from rand().
+static inline long random(void)
+{
+    return rand();
+}
+
+// Seeds the generator by forwarding to srand().
+static inline void srandom(unsigned int seed)
+{
+    srand(seed);
+}
+
+#endif // _MSC_CONFIG_H_ ]
diff --git a/volk/cmake/msvc/inttypes.h b/volk/cmake/msvc/inttypes.h
new file mode 100644
index 000000000..0a1b60fc1
--- /dev/null
+++ b/volk/cmake/msvc/inttypes.h
@@ -0,0 +1,301 @@
+// ISO C9x compliant inttypes.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
+// Copyright (c) 2006 Alexander Chemeris
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. The name of the author may be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_INTTYPES_H_ // [
+#define _MSC_INTTYPES_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#include <stdint.h>
+
+// 7.8 Format conversion of integer types
+
+typedef struct {
+ intmax_t quot;
+ intmax_t rem;
+} imaxdiv_t;
+
+// 7.8.1 Macros for format specifiers
+
+// The fprintf macros for signed integers are:
+#define PRId8 "d"
+#define PRIi8 "i"
+#define PRIdLEAST8 "d"
+#define PRIiLEAST8 "i"
+#define PRIdFAST8 "d"
+#define PRIiFAST8 "i"
+
+#define PRId16 "hd"
+#define PRIi16 "hi"
+#define PRIdLEAST16 "hd"
+#define PRIiLEAST16 "hi"
+#define PRIdFAST16 "hd"
+#define PRIiFAST16 "hi"
+
+#define PRId32 "I32d"
+#define PRIi32 "I32i"
+#define PRIdLEAST32 "I32d"
+#define PRIiLEAST32 "I32i"
+#define PRIdFAST32 "I32d"
+#define PRIiFAST32 "I32i"
+
+#define PRId64 "I64d"
+#define PRIi64 "I64i"
+#define PRIdLEAST64 "I64d"
+#define PRIiLEAST64 "I64i"
+#define PRIdFAST64 "I64d"
+#define PRIiFAST64 "I64i"
+
+#define PRIdMAX "I64d"
+#define PRIiMAX "I64i"
+
+#define PRIdPTR "Id"
+#define PRIiPTR "Ii"
+
+// The fprintf macros for unsigned integers are:
+#define PRIo8 "o"
+#define PRIu8 "u"
+#define PRIx8 "x"
+#define PRIX8 "X"
+#define PRIoLEAST8 "o"
+#define PRIuLEAST8 "u"
+#define PRIxLEAST8 "x"
+#define PRIXLEAST8 "X"
+#define PRIoFAST8 "o"
+#define PRIuFAST8 "u"
+#define PRIxFAST8 "x"
+#define PRIXFAST8 "X"
+
+#define PRIo16 "ho"
+#define PRIu16 "hu"
+#define PRIx16 "hx"
+#define PRIX16 "hX"
+#define PRIoLEAST16 "ho"
+#define PRIuLEAST16 "hu"
+#define PRIxLEAST16 "hx"
+#define PRIXLEAST16 "hX"
+#define PRIoFAST16 "ho"
+#define PRIuFAST16 "hu"
+#define PRIxFAST16 "hx"
+#define PRIXFAST16 "hX"
+
+#define PRIo32 "I32o"
+#define PRIu32 "I32u"
+#define PRIx32 "I32x"
+#define PRIX32 "I32X"
+#define PRIoLEAST32 "I32o"
+#define PRIuLEAST32 "I32u"
+#define PRIxLEAST32 "I32x"
+#define PRIXLEAST32 "I32X"
+#define PRIoFAST32 "I32o"
+#define PRIuFAST32 "I32u"
+#define PRIxFAST32 "I32x"
+#define PRIXFAST32 "I32X"
+
+#define PRIo64 "I64o"
+#define PRIu64 "I64u"
+#define PRIx64 "I64x"
+#define PRIX64 "I64X"
+#define PRIoLEAST64 "I64o"
+#define PRIuLEAST64 "I64u"
+#define PRIxLEAST64 "I64x"
+#define PRIXLEAST64 "I64X"
+#define PRIoFAST64 "I64o"
+#define PRIuFAST64 "I64u"
+#define PRIxFAST64 "I64x"
+#define PRIXFAST64 "I64X"
+
+#define PRIoMAX "I64o"
+#define PRIuMAX "I64u"
+#define PRIxMAX "I64x"
+#define PRIXMAX "I64X"
+
+#define PRIoPTR "Io"
+#define PRIuPTR "Iu"
+#define PRIxPTR "Ix"
+#define PRIXPTR "IX"
+
+// The fscanf macros for signed integers are:
+#define SCNd8 "d"
+#define SCNi8 "i"
+#define SCNdLEAST8 "d"
+#define SCNiLEAST8 "i"
+#define SCNdFAST8 "d"
+#define SCNiFAST8 "i"
+
+#define SCNd16 "hd"
+#define SCNi16 "hi"
+#define SCNdLEAST16 "hd"
+#define SCNiLEAST16 "hi"
+#define SCNdFAST16 "hd"
+#define SCNiFAST16 "hi"
+
+#define SCNd32 "ld"
+#define SCNi32 "li"
+#define SCNdLEAST32 "ld"
+#define SCNiLEAST32 "li"
+#define SCNdFAST32 "ld"
+#define SCNiFAST32 "li"
+
+#define SCNd64 "I64d"
+#define SCNi64 "I64i"
+#define SCNdLEAST64 "I64d"
+#define SCNiLEAST64 "I64i"
+#define SCNdFAST64 "I64d"
+#define SCNiFAST64 "I64i"
+
+#define SCNdMAX "I64d"
+#define SCNiMAX "I64i"
+
+#ifdef _WIN64 // [
+# define SCNdPTR "I64d"
+# define SCNiPTR "I64i"
+#else // _WIN64 ][
+# define SCNdPTR "ld"
+# define SCNiPTR "li"
+#endif // _WIN64 ]
+
+// The fscanf macros for unsigned integers are:
+#define SCNo8 "o"
+#define SCNu8 "u"
+#define SCNx8 "x"
+#define SCNX8 "X"
+#define SCNoLEAST8 "o"
+#define SCNuLEAST8 "u"
+#define SCNxLEAST8 "x"
+#define SCNXLEAST8 "X"
+#define SCNoFAST8 "o"
+#define SCNuFAST8 "u"
+#define SCNxFAST8 "x"
+#define SCNXFAST8 "X"
+
+#define SCNo16 "ho"
+#define SCNu16 "hu"
+#define SCNx16 "hx"
+#define SCNX16 "hX"
+#define SCNoLEAST16 "ho"
+#define SCNuLEAST16 "hu"
+#define SCNxLEAST16 "hx"
+#define SCNXLEAST16 "hX"
+#define SCNoFAST16 "ho"
+#define SCNuFAST16 "hu"
+#define SCNxFAST16 "hx"
+#define SCNXFAST16 "hX"
+
+#define SCNo32 "lo"
+#define SCNu32 "lu"
+#define SCNx32 "lx"
+#define SCNX32 "lX"
+#define SCNoLEAST32 "lo"
+#define SCNuLEAST32 "lu"
+#define SCNxLEAST32 "lx"
+#define SCNXLEAST32 "lX"
+#define SCNoFAST32 "lo"
+#define SCNuFAST32 "lu"
+#define SCNxFAST32 "lx"
+#define SCNXFAST32 "lX"
+
+#define SCNo64 "I64o"
+#define SCNu64 "I64u"
+#define SCNx64 "I64x"
+#define SCNX64 "I64X"
+#define SCNoLEAST64 "I64o"
+#define SCNuLEAST64 "I64u"
+#define SCNxLEAST64 "I64x"
+#define SCNXLEAST64 "I64X"
+#define SCNoFAST64 "I64o"
+#define SCNuFAST64 "I64u"
+#define SCNxFAST64 "I64x"
+#define SCNXFAST64 "I64X"
+
+#define SCNoMAX "I64o"
+#define SCNuMAX "I64u"
+#define SCNxMAX "I64x"
+#define SCNXMAX "I64X"
+
+#ifdef _WIN64 // [
+# define SCNoPTR "I64o"
+# define SCNuPTR "I64u"
+# define SCNxPTR "I64x"
+# define SCNXPTR "I64X"
+#else // _WIN64 ][
+# define SCNoPTR "lo"
+# define SCNuPTR "lu"
+# define SCNxPTR "lx"
+# define SCNXPTR "lX"
+#endif // _WIN64 ]
+
+// 7.8.2 Functions for greatest-width integer types
+
+// 7.8.2.1 The imaxabs function
+#define imaxabs _abs64
+
+// 7.8.2.2 The imaxdiv function
+
+// imaxdiv() for intmax_t, adapted from the div() function in Microsoft's
+// div.c (found in %MSVC.NET%\crt\src\div.c).
+// Define STATIC_IMAXDIV before including this header to get a `static`
+// definition; otherwise the function is declared `_inline`.
+#ifndef STATIC_IMAXDIV // [
+_inline
+#else // STATIC_IMAXDIV ][
+static
+#endif // STATIC_IMAXDIV ]
+imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom)
+{
+    imaxdiv_t qr;
+
+    qr.quot = numer / denom;
+    qr.rem = numer % denom;
+
+    // A negative numerator must not leave a positive remainder; if the
+    // division produced one, move the quotient up by one and adjust the
+    // remainder to match.
+    if (qr.rem > 0 && numer < 0) {
+        qr.quot += 1;
+        qr.rem -= denom;
+    }
+
+    return qr;
+}
+
+// 7.8.2.3 The strtoimax and strtoumax functions
+#define strtoimax _strtoi64
+#define strtoumax _strtoui64
+
+// 7.8.2.4 The wcstoimax and wcstoumax functions
+#define wcstoimax _wcstoi64
+#define wcstoumax _wcstoui64
+
+
+#endif // _MSC_INTTYPES_H_ ]
diff --git a/volk/cmake/msvc/stdbool.h b/volk/cmake/msvc/stdbool.h
new file mode 100644
index 000000000..ca4581d37
--- /dev/null
+++ b/volk/cmake/msvc/stdbool.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2005, 2006 Apple Computer, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef STDBOOL_WIN32_H
+#define STDBOOL_WIN32_H
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+/* Only C needs these definitions; they are skipped when compiling as C++. */
+#ifndef __cplusplus
+
+/* bool is emulated with a one-byte unsigned type. */
+typedef unsigned char bool;
+
+#define true 1
+#define false 0
+
+/*
+ * Compile-time assertion: declares an array typedef whose size is -1,
+ * and therefore invalid, when exp is false. name only serves to make
+ * the typedef identifier unique.
+ */
+#ifndef CASSERT
+#define CASSERT(exp, name) typedef int dummy##name [(exp) ? 1 : -1];
+#endif
+
+/* Sanity checks on the emulation above. */
+CASSERT(sizeof(bool) == 1, bool_is_one_byte)
+CASSERT(true, true_is_true)
+CASSERT(!false, false_is_false)
+
+#endif
+
+#endif
diff --git a/volk/cmake/msvc/stdint.h b/volk/cmake/msvc/stdint.h
new file mode 100644
index 000000000..108bc8982
--- /dev/null
+++ b/volk/cmake/msvc/stdint.h
@@ -0,0 +1,251 @@
+// ISO C9x compliant stdint.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
+// Copyright (c) 2006-2008 Alexander Chemeris
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. The name of the author may be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_STDINT_H_ // [
+#define _MSC_STDINT_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#include <limits.h>
+
+// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
+// compiling for ARM we should wrap <wchar.h> include with 'extern "C++" {}'
+// or compiler give many errors like this:
+// error C2733: second C linkage of overloaded function 'wmemchr' not allowed
+#ifdef __cplusplus
+extern "C" {
+#endif
+# include <wchar.h>
+#ifdef __cplusplus
+}
+#endif
+
+// Define _W64 macros to mark types changing their size, like intptr_t.
+#ifndef _W64
+# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
+# define _W64 __w64
+# else
+# define _W64
+# endif
+#endif
+
+
+// 7.18.1 Integer types
+
+// 7.18.1.1 Exact-width integer types
+
+// Visual Studio 6 and Embedded Visual C++ 4 doesn't
+// realize that, e.g. char has the same size as __int8
+// so we give up on __intX for them.
+#if (_MSC_VER < 1300)
+ typedef signed char int8_t;
+ typedef signed short int16_t;
+ typedef signed int int32_t;
+ typedef unsigned char uint8_t;
+ typedef unsigned short uint16_t;
+ typedef unsigned int uint32_t;
+#else
+ typedef signed __int8 int8_t;
+ typedef signed __int16 int16_t;
+ typedef signed __int32 int32_t;
+ typedef unsigned __int8 uint8_t;
+ typedef unsigned __int16 uint16_t;
+ typedef unsigned __int32 uint32_t;
+#endif
+typedef signed __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+
+
+// 7.18.1.2 Minimum-width integer types
+typedef int8_t int_least8_t;
+typedef int16_t int_least16_t;
+typedef int32_t int_least32_t;
+typedef int64_t int_least64_t;
+typedef uint8_t uint_least8_t;
+typedef uint16_t uint_least16_t;
+typedef uint32_t uint_least32_t;
+typedef uint64_t uint_least64_t;
+
+// 7.18.1.3 Fastest minimum-width integer types
+typedef int8_t int_fast8_t;
+typedef int16_t int_fast16_t;
+typedef int32_t int_fast32_t;
+typedef int64_t int_fast64_t;
+typedef uint8_t uint_fast8_t;
+typedef uint16_t uint_fast16_t;
+typedef uint32_t uint_fast32_t;
+typedef uint64_t uint_fast64_t;
+
+// 7.18.1.4 Integer types capable of holding object pointers
+#ifdef _WIN64 // [
+ typedef signed __int64 intptr_t;
+ typedef unsigned __int64 uintptr_t;
+#else // _WIN64 ][
+ typedef _W64 signed int intptr_t;
+ typedef _W64 unsigned int uintptr_t;
+#endif // _WIN64 ]
+
+// 7.18.1.5 Greatest-width integer types
+typedef int64_t intmax_t;
+typedef uint64_t uintmax_t;
+
+
+// 7.18.2 Limits of specified-width integer types
+
+#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
+
+// 7.18.2.1 Limits of exact-width integer types
+#define INT8_MIN ((int8_t)_I8_MIN)
+#define INT8_MAX _I8_MAX
+#define INT16_MIN ((int16_t)_I16_MIN)
+#define INT16_MAX _I16_MAX
+#define INT32_MIN ((int32_t)_I32_MIN)
+#define INT32_MAX _I32_MAX
+#define INT64_MIN ((int64_t)_I64_MIN)
+#define INT64_MAX _I64_MAX
+#define UINT8_MAX _UI8_MAX
+#define UINT16_MAX _UI16_MAX
+#define UINT32_MAX _UI32_MAX
+#define UINT64_MAX _UI64_MAX
+
+// 7.18.2.2 Limits of minimum-width integer types
+#define INT_LEAST8_MIN INT8_MIN
+#define INT_LEAST8_MAX INT8_MAX
+#define INT_LEAST16_MIN INT16_MIN
+#define INT_LEAST16_MAX INT16_MAX
+#define INT_LEAST32_MIN INT32_MIN
+#define INT_LEAST32_MAX INT32_MAX
+#define INT_LEAST64_MIN INT64_MIN
+#define INT_LEAST64_MAX INT64_MAX
+#define UINT_LEAST8_MAX UINT8_MAX
+#define UINT_LEAST16_MAX UINT16_MAX
+#define UINT_LEAST32_MAX UINT32_MAX
+#define UINT_LEAST64_MAX UINT64_MAX
+
+// 7.18.2.3 Limits of fastest minimum-width integer types
+#define INT_FAST8_MIN INT8_MIN
+#define INT_FAST8_MAX INT8_MAX
+#define INT_FAST16_MIN INT16_MIN
+#define INT_FAST16_MAX INT16_MAX
+#define INT_FAST32_MIN INT32_MIN
+#define INT_FAST32_MAX INT32_MAX
+#define INT_FAST64_MIN INT64_MIN
+#define INT_FAST64_MAX INT64_MAX
+#define UINT_FAST8_MAX UINT8_MAX
+#define UINT_FAST16_MAX UINT16_MAX
+#define UINT_FAST32_MAX UINT32_MAX
+#define UINT_FAST64_MAX UINT64_MAX
+
+// 7.18.2.4 Limits of integer types capable of holding object pointers
+#ifdef _WIN64 // [
+# define INTPTR_MIN INT64_MIN
+# define INTPTR_MAX INT64_MAX
+# define UINTPTR_MAX UINT64_MAX
+#else // _WIN64 ][
+# define INTPTR_MIN INT32_MIN
+# define INTPTR_MAX INT32_MAX
+# define UINTPTR_MAX UINT32_MAX
+#endif // _WIN64 ]
+
+// 7.18.2.5 Limits of greatest-width integer types
+#define INTMAX_MIN INT64_MIN
+#define INTMAX_MAX INT64_MAX
+#define UINTMAX_MAX UINT64_MAX
+
+// 7.18.3 Limits of other integer types
+
+#ifdef _WIN64 // [
+# define PTRDIFF_MIN _I64_MIN
+# define PTRDIFF_MAX _I64_MAX
+#else // _WIN64 ][
+# define PTRDIFF_MIN _I32_MIN
+# define PTRDIFF_MAX _I32_MAX
+#endif // _WIN64 ]
+
+#define SIG_ATOMIC_MIN INT_MIN
+#define SIG_ATOMIC_MAX INT_MAX
+
+#ifndef SIZE_MAX // [
+# ifdef _WIN64 // [
+# define SIZE_MAX _UI64_MAX
+# else // _WIN64 ][
+# define SIZE_MAX _UI32_MAX
+# endif // _WIN64 ]
+#endif // SIZE_MAX ]
+
+// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
+#ifndef WCHAR_MIN // [
+# define WCHAR_MIN 0
+#endif // WCHAR_MIN ]
+#ifndef WCHAR_MAX // [
+# define WCHAR_MAX _UI16_MAX
+#endif // WCHAR_MAX ]
+
+#define WINT_MIN 0
+#define WINT_MAX _UI16_MAX
+
+#endif // __STDC_LIMIT_MACROS ]
+
+
+// 7.18.4 Macros for integer constants
+
+#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
+
+// 7.18.4.1 Macros for minimum-width integer constants
+
+#define INT8_C(val) val##i8
+#define INT16_C(val) val##i16
+#define INT32_C(val) val##i32
+#define INT64_C(val) val##i64
+
+#define UINT8_C(val) val##ui8
+#define UINT16_C(val) val##ui16
+#define UINT32_C(val) val##ui32
+#define UINT64_C(val) val##ui64
+
+// 7.18.4.2 Macros for greatest-width integer constants
+#ifndef INTMAX_C
+#define INTMAX_C INT64_C
+#endif
+#ifndef UINTMAX_C
+#define UINTMAX_C UINT64_C
+#endif
+
+#endif // __STDC_CONSTANT_MACROS ]
+
+
+#endif // _MSC_STDINT_H_ ]
diff --git a/volk/gen/archs.xml b/volk/gen/archs.xml
new file mode 100644
index 000000000..2c9ab41a5
--- /dev/null
+++ b/volk/gen/archs.xml
@@ -0,0 +1,184 @@
+<!-- archs appear in order of significance for blind, de-facto version ordering -->
+<grammar>
+
+<arch name="generic"> <!-- name is required-->
+</arch>
+
+<arch name="altivec">
+ <flag compiler="gnu">-maltivec</flag>
+ <alignment>16</alignment>
+ <check name="has_ppc"></check>
+</arch>
+
+<arch name="neon">
+ <flag compiler="gnu">-mfpu=neon</flag>
+ <flag compiler="gnu">-mfloat-abi=softfp</flag>
+ <flag compiler="gnu">-funsafe-math-optimizations</flag>
+ <alignment>16</alignment>
+ <check name="has_neon"></check>
+</arch>
+
+<arch name="32">
+ <flag compiler="gnu">-m32</flag>
+</arch>
+
+<arch name="64">
+ <check name="check_extended_cpuid">
+ <param>0x80000001</param>
+ </check>
+ <check name="cpuid_x86_bit"> <!-- checks to see if a bit is set -->
+ <param>3</param> <!-- eax, ebx, ecx, [edx] -->
+ <param>0x80000001</param> <!-- cpuid operation -->
+ <param>29</param> <!-- bit shift -->
+ </check>
+ <flag compiler="gnu">-m64</flag>
+</arch>
+
+<arch name="3dnow">
+ <check name="cpuid_x86_bit">
+ <param>3</param>
+ <param>0x80000001</param>
+ <param>31</param>
+ </check>
+ <flag compiler="gnu">-m3dnow</flag>
+ <alignment>8</alignment>
+</arch>
+
+<arch name="abm">
+ <check name="cpuid_x86_bit">
+ <param>2</param> <!-- eax, ebx, [ecx], edx: ABM (LZCNT) is reported in ECX, not EDX -->
+ <param>0x80000001</param> <!-- extended feature leaf -->
+ <param>5</param> <!-- bit 5 = ABM -->
+ </check>
+ <flag compiler="gnu">-mabm</flag> <!-- enable the ABM instructions themselves rather than SSE4.2 -->
+ <alignment>16</alignment>
+</arch>
+
+<arch name="popcount">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>23</param>
+ </check>
+ <flag compiler="gnu">-mpopcnt</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+</arch>
+
+<arch name="mmx">
+ <check name="cpuid_x86_bit">
+ <param>3</param>
+ <param>0x00000001</param>
+ <param>23</param>
+ </check>
+ <flag compiler="gnu">-mmmx</flag>
+ <flag compiler="msvc">/arch:SSE</flag>
+ <alignment>8</alignment>
+</arch>
+
+<arch name="sse">
+ <check name="cpuid_x86_bit">
+ <param>3</param>
+ <param>0x00000001</param>
+ <param>25</param>
+ </check>
+ <flag compiler="gnu">-msse</flag>
+ <flag compiler="msvc">/arch:SSE</flag>
+ <environment>_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);</environment>
+ <include>xmmintrin.h</include>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="sse2">
+ <check name="cpuid_x86_bit">
+ <param>3</param>
+ <param>0x00000001</param>
+ <param>26</param>
+ </check>
+ <flag compiler="gnu">-msse2</flag>
+ <flag compiler="msvc">/arch:SSE2</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="orc">
+</arch>
+
+<!-- it's here for overrule stuff. -->
+<arch name="norc">
+</arch>
+
+<arch name="sse3">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>0</param>
+ </check>
+ <flag compiler="gnu">-msse3</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <environment>_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);</environment>
+ <include>pmmintrin.h</include>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="ssse3">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>9</param>
+ </check>
+ <flag compiler="gnu">-mssse3</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="sse4_a">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x80000001</param>
+ <param>6</param>
+ </check>
+ <flag compiler="gnu">-msse4a</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="sse4_1">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>19</param>
+ </check>
+ <flag compiler="gnu">-msse4.1</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="sse4_2">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>20</param>
+ </check>
+ <flag compiler="gnu">-msse4.2</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="avx">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>28</param>
+ </check>
+ <!-- check to make sure that xgetbv is enabled in OS -->
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>27</param>
+ </check>
+ <!-- check to see that the OS has enabled AVX -->
+ <check name="get_avx_enabled"></check>
+ <flag compiler="gnu">-mavx</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <alignment>32</alignment>
+</arch>
+
+</grammar>
diff --git a/volk/gen/machines.xml b/volk/gen/machines.xml
new file mode 100644
index 000000000..d88a1a50c
--- /dev/null
+++ b/volk/gen/machines.xml
@@ -0,0 +1,55 @@
+<grammar>
+
+<machine name="generic">
+<archs>generic orc|</archs>
+</machine>
+
+<!--
+<machine name="mmx">
+<archs>generic 32|64 mmx orc|</archs>
+</machine>
+
+<machine name="sse">
+<archs>generic 32|64| mmx| sse orc|</archs>
+</machine>
+-->
+
+<machine name="neon">
+<archs>generic neon orc|</archs>
+</machine>
+
+<!-- a trailing | (as in "64|" or "mmx|") also generates a machine variant with that arch left out; needed for MSVC -->
+<machine name="sse2">
+<archs>generic 32|64| mmx| sse sse2 orc|</archs>
+</machine>
+
+<machine name="sse3">
+<archs>generic 32|64 mmx sse sse2 sse3 orc|</archs>
+</machine>
+
+<machine name="ssse3">
+<archs>generic 32|64 mmx sse sse2 sse3 ssse3 orc|</archs>
+</machine>
+
+<machine name="sse4_a">
+<archs>generic 32|64 mmx sse sse2 sse3 sse4_a popcount orc|</archs>
+</machine>
+
+<machine name="sse4_1">
+<archs>generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 orc|</archs>
+</machine>
+
+<machine name="sse4_2">
+<archs>generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount orc|</archs>
+</machine>
+
+<!-- a trailing | (as in "64|" or "mmx|") also generates a machine variant with that arch left out; needed for MSVC -->
+<machine name="avx">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs>
+</machine>
+
+<machine name="altivec">
+<archs>generic altivec</archs>
+</machine>
+
+</grammar>
diff --git a/volk/gen/volk_arch_defs.py b/volk/gen/volk_arch_defs.py
new file mode 100644
index 000000000..3c75e1374
--- /dev/null
+++ b/volk/gen/volk_arch_defs.py
@@ -0,0 +1,85 @@
+#
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+archs = list()
+arch_dict = dict()
+
+# Describes one architecture entry: its name, optional environment/include
+# strings, alignment, runtime checks and per-compiler flags.
+class arch_class:
+ # flags: dict mapping compiler name -> list of flag strings
+ # checks: list of [check_name, params] pairs, stored as given
+ # kwargs: may carry 'name', 'environment', 'include', 'alignment'
+ def __init__(self, flags, checks, **kwargs):
+ # (attribute, converter, fallback) triples; the fallback is used when the
+ # key is missing from kwargs or the conversion raises
+ for key, cast, failval in (
+ ('name', str, None),
+ ('environment', str, None),
+ ('include', str, None),
+ ('alignment', int, 1)
+ ):
+ try: setattr(self, key, cast(kwargs[key]))
+ except: setattr(self, key, failval)
+ self.checks = checks
+ # name has no usable fallback, so a missing name fails here
+ assert(self.name)
+ self._flags = flags
+
+ # An arch with no flags at all is treated as supported by every compiler;
+ # otherwise the compiler must have at least one flag entry.
+ def is_supported(self, compiler):
+ if not self._flags.keys(): return True
+ return compiler in self._flags.keys()
+
+ # Returns the flag list for the compiler, or an empty list if none is registered.
+ def get_flags(self, compiler):
+ try: return self._flags[compiler]
+ except KeyError: return list()
+
+ def __repr__(self): return self.name
+
+# Builds an arch_class from kwargs, appends it to the module-level `archs`
+# list (preserving registration order) and indexes it by name in `arch_dict`.
+def register_arch(**kwargs):
+ arch = arch_class(**kwargs)
+ archs.append(arch)
+ arch_dict[arch.name] = arch
+
+########################################################################
+# register the arches
+########################################################################
+#TODO skip the XML and put it here
+# Import-time loader: parses archs.xml (located next to this file) and
+# registers one arch per <arch> element, in document order.
+from xml.dom import minidom
+import os
+gendir = os.path.dirname(__file__)
+archs_xml = minidom.parse(os.path.join(gendir, 'archs.xml')).getElementsByTagName('arch')
+for arch_xml in archs_xml:
+ kwargs = dict()
+ # XML attributes (e.g. name="sse") become keyword arguments
+ for attr in arch_xml.attributes.keys():
+ kwargs[attr] = arch_xml.attributes[attr].value
+ # Child elements become keyword arguments keyed by tag name, using the text
+ # of the FIRST element with that tag. Nodes without a tagName (text,
+ # comments) and elements without text raise and are skipped by the bare except.
+ for node in arch_xml.childNodes:
+ try:
+ name = node.tagName
+ val = arch_xml.getElementsByTagName(name)[0].firstChild.data
+ kwargs[name] = val
+ except: pass
+ # Each <check> yields [name, [param strings...]]
+ checks = list()
+ for check_xml in arch_xml.getElementsByTagName("check"):
+ name = check_xml.attributes["name"].value
+ params = list()
+ for param_xml in check_xml.getElementsByTagName("param"):
+ params.append(param_xml.firstChild.data)
+ checks.append([name, params])
+ # <flag compiler="..."> elements are grouped into a list per compiler
+ flags = dict()
+ for flag_xml in arch_xml.getElementsByTagName("flag"):
+ name = flag_xml.attributes["compiler"].value
+ if not flags.has_key(name): flags[name] = list()
+ flags[name].append(flag_xml.firstChild.data)
+ #force kwargs keys to be of type str, not unicode for py25
+ kwargs = dict((str(k), v) for k, v in kwargs.iteritems())
+ register_arch(flags=flags, checks=checks, **kwargs)
+
+# Running the module directly prints the registered archs (Python 2 print).
+if __name__ == '__main__':
+ print archs
diff --git a/volk/gen/volk_compile_utils.py b/volk/gen/volk_compile_utils.py
new file mode 100644
index 000000000..cf1357375
--- /dev/null
+++ b/volk/gen/volk_compile_utils.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+#
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import optparse
+import volk_arch_defs
+import volk_machine_defs
+
+# Prints one ';'-separated line with an entry per arch supported by
+# `compiler`; each entry is the arch name followed by its flags, ','-joined.
+def do_arch_flags_list(compiler):
+ output = list()
+ for arch in volk_arch_defs.archs:
+ if not arch.is_supported(compiler): continue
+ fields = [arch.name] + arch.get_flags(compiler)
+ output.append(','.join(fields))
+ print ';'.join(output)
+
+def do_machines_list(arch_names):
+ output = list()
+ for machine in volk_machine_defs.machines:
+ machine_arch_set = set(machine.arch_names)
+ if set(arch_names).intersection(machine_arch_set) == machine_arch_set:
+ output.append(machine.name)
+ print ';'.join(output)
+
+# Prints, space-separated, the `compiler` flags of every arch in the named
+# machine, in the machine's arch order. An unknown machine_name raises
+# KeyError from the machine_dict lookup.
+def do_machine_flags_list(compiler, machine_name):
+ output = list()
+ machine = volk_machine_defs.machine_dict[machine_name]
+ for arch in machine.archs:
+ output.extend(arch.get_flags(compiler))
+ print ' '.join(output)
+
+# Command-line entry point. --mode selects the query:
+#   arch_flags    needs --compiler
+#   machines      needs --archs (';'-separated arch names)
+#   machine_flags needs --compiler and --machine
+# Any other mode falls through and does nothing.
+def main():
+ parser = optparse.OptionParser()
+ parser.add_option('--mode', type='string')
+ parser.add_option('--compiler', type='string')
+ parser.add_option('--archs', type='string')
+ parser.add_option('--machine', type='string')
+ (opts, args) = parser.parse_args()
+
+ # the compiler name is lower-cased before lookup
+ if opts.mode == 'arch_flags': return do_arch_flags_list(opts.compiler.lower())
+ if opts.mode == 'machines': return do_machines_list(opts.archs.split(';'))
+ if opts.mode == 'machine_flags': return do_machine_flags_list(opts.compiler.lower(), opts.machine)
+
+if __name__ == '__main__': main()
diff --git a/volk/gen/volk_kernel_defs.py b/volk/gen/volk_kernel_defs.py
new file mode 100644
index 000000000..f246db0f9
--- /dev/null
+++ b/volk/gen/volk_kernel_defs.py
@@ -0,0 +1,209 @@
+#
+# Copyright 2011-2012 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+import os
+import re
+import sys
+import glob
+
+########################################################################
+# Strip comments from a c/cpp file.
+# Input is code string, output is code string without comments.
+# http://stackoverflow.com/questions/241327/python-snippet-to-remove-c-and-c-comments
+########################################################################
+# Returns `text` with // line comments and /* */ block comments removed.
+def comment_remover(text):
+ # The pattern also matches quoted literals; those do not start with '/',
+ # so they are returned unchanged and comment markers inside them survive.
+ def replacer(match):
+ s = match.group(0)
+ if s.startswith('/'):
+ return ""
+ else:
+ return s
+ # alternatives, in order: // comment, /* comment */, '...' literal, "..." literal
+ pattern = re.compile(
+ r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+ re.DOTALL | re.MULTILINE
+ )
+ return re.sub(pattern, replacer, text)
+
+########################################################################
+# Split code into nested sections according to ifdef preprocessor macros
+########################################################################
+# Splits `code` into a list of (header, body) pairs.
+#   header is 'text' for code outside any conditional, otherwise the
+#   preprocessor line (#if/#ifdef/#ifndef/#else/#elif) that opened the section.
+#   body is a string for 'text' sections and a recursively split list otherwise.
+# Sections whose body is only whitespace are dropped.
+def split_into_nested_ifdef_sections(code):
+ sections = list()
+ section = ''
+ header = 'text'
+ in_section_depth = 0
+ for i, line in enumerate(code.splitlines()):
+ # classify the line by its preprocessor directive, if it has one
+ m = re.match('^(\s*)#(\s*)(\w+)(.*)$', line)
+ line_is = 'normal'
+ if m:
+ p0, p1, fcn, stuff = m.groups()
+ if fcn in ('if', 'ifndef', 'ifdef'): line_is = 'if'
+ if fcn in ('else', 'elif'): line_is = 'else'
+ if fcn in ('endif',): line_is = 'end'
+
+ # depth is updated before the checks below, so depth 1 on an 'if' line
+ # means an outermost conditional has just been opened
+ if line_is == 'if': in_section_depth += 1
+ if line_is == 'end': in_section_depth -= 1
+
+ # outermost #if: close the current section and start one headed by this line
+ if in_section_depth == 1 and line_is == 'if':
+ sections.append((header, section))
+ section = ''
+ header = line
+ continue
+
+ # outermost #else/#elif: same, the new section is headed by the else line
+ if in_section_depth == 1 and line_is == 'else':
+ sections.append((header, section))
+ section = ''
+ header = line
+ continue
+
+ # outermost #endif: close the section and go back to plain text
+ if in_section_depth == 0 and line_is == 'end':
+ sections.append((header, section))
+ section = ''
+ header = 'text'
+ continue
+
+ # everything else, including nested directives, is kept in the body
+ section += line + '\n'
+
+ sections.append((header, section)) #and pack remainder into sections
+ sections = [sec for sec in sections if sec[1].strip()] #filter empty sections
+
+ #recurse into non-text sections to fill subsections
+ for i, (header, section) in enumerate(sections):
+ if header == 'text': continue
+ sections[i] = (header, split_into_nested_ifdef_sections(section))
+
+ return sections
+
+########################################################################
+# Recursive print of sections to test code above
+########################################################################
+# Debug helper: prints the structure returned by
+# split_into_nested_ifdef_sections. Text bodies are printed with `indent`
+# before each line; conditional headers are printed after a dashed arrow and
+# their bodies are printed recursively with a longer indent.
+def print_sections(sections, indent = ' '):
+ for header, body in sections:
+ if header == 'text':
+ print indent, ('\n'+indent).join(body.splitlines())
+ continue
+ print indent.replace(' ', '-') + '>', header
+ print_sections(body, indent + ' ')
+
+########################################################################
+# Flatten a section to just body text
+########################################################################
+def flatten_section_text(sections):
+ output = ''
+ for hdr, bdy in sections:
+ if hdr != 'text': output += flatten_section_text(bdy)
+ else: output += bdy
+ return output
+
+########################################################################
+# Extract kernel info from section, represent as an implementation
+########################################################################
+# One implementation of a kernel, parsed from a conditional section.
+#   deps       set of lower-cased names that follow LV_HAVE_ in `header`
+#   name       matched function name with the leading kern_name + '_' removed
+#   args       list of (type, name) pairs from the function's parameter list
+#   is_aligned True when name starts with 'a_'
+class impl_class:
+ def __init__(self, kern_name, header, body):
+ #extract LV_HAVE_*
+ self.deps = set(map(str.lower, re.findall('LV_HAVE_(\w+)', header)))
+ #extract function suffix and args
+ body = flatten_section_text(body)
+ try:
+ # captures the identifier starting with kern_name and everything after
+ # the opening parenthesis that follows it
+ fcn_matcher = re.compile('^.*(%s\\w*)\\s*\\((.*)$'%kern_name, re.DOTALL | re.MULTILINE)
+ body = body.split('{')[0].rsplit(')', 1)[0] #get the part before the open ){ bracket
+ m = fcn_matcher.match(body)
+ impl_name, the_rest = m.groups()
+ self.name = impl_name.replace(kern_name+'_', '')
+ self.args = list()
+ fcn_args = the_rest.split(',')
+ for fcn_arg in fcn_args:
+ # the last word is the argument name; what precedes it is the type
+ arg_matcher = re.compile('^\s*(.*\\W)\s*(\w+)\s*$', re.DOTALL | re.MULTILINE)
+ m = arg_matcher.match(fcn_arg)
+ arg_type, arg_name = m.groups()
+ self.args.append((arg_type, arg_name))
+ # any failure above (e.g. a regex that did not match) is re-raised with
+ # the kernel name and the text that was being parsed
+ except Exception as ex:
+ raise Exception, 'I cant parse the function prototype from: %s in %s\n%s'%(kern_name, body, ex)
+
+ assert self.name
+ self.is_aligned = self.name.startswith('a_')
+
+ def __repr__(self):
+ return self.name
+
+########################################################################
+# Get sets of LV_HAVE_* from the code
+########################################################################
+# Returns a list with one set per preprocessor line (a line whose first
+# non-blank character is '#') that mentions LV_HAVE_*; each set holds the
+# lower-cased names following LV_HAVE_ on that line.
+def extract_lv_haves(code):
+ haves = list()
+ for line in code.splitlines():
+ if not line.strip().startswith('#'): continue
+ have_set = set(map(str.lower, re.findall('LV_HAVE_(\w+)', line)))
+ if have_set: haves.append(have_set)
+ return haves
+
+########################################################################
+# Represent a processing kernel, parse from file
+########################################################################
+# A processing kernel parsed from one header file.
+#   name            file name without directory or extension
+#   pname           name with 'volk_' replaced by 'p_'
+#   has_dispatcher  True if an implementation named 'dispatcher' was found
+#   args            argument list of the first remaining implementation
+#   arglist_*       ', '-joined renderings of args (types, 'type name', names)
+class kernel_class:
+ def __init__(self, kernel_file):
+ self.name = os.path.splitext(os.path.basename(kernel_file))[0]
+ self.pname = self.name.replace('volk_', 'p_')
+ code = open(kernel_file, 'r').read()
+ code = comment_remover(code)
+ sections = split_into_nested_ifdef_sections(code)
+ self._impls = list()
+ # only top-level #ifndef sections are searched (presumably the include
+ # guard); inside them, every conditional mentioning LV_HAVE_ is one impl
+ for header, section in sections:
+ if 'ifndef' not in header.lower(): continue
+ for sub_hdr, body in section:
+ if 'if' not in sub_hdr.lower(): continue
+ if 'LV_HAVE_' not in sub_hdr: continue
+ self._impls.append(impl_class(
+ kern_name=self.name, header=sub_hdr, body=body,
+ ))
+ assert(self._impls)
+ # the first impl named 'dispatcher' is removed from the list and only
+ # recorded as a flag
+ self.has_dispatcher = False
+ for impl in self._impls:
+ if impl.name == 'dispatcher':
+ self._impls.remove(impl)
+ self.has_dispatcher = True
+ break
+ # NOTE(review): raises IndexError if the dispatcher was the only impl
+ self.args = self._impls[0].args
+ self.arglist_types = ', '.join([a[0] for a in self.args])
+ self.arglist_full = ', '.join(['%s %s'%a for a in self.args])
+ self.arglist_names = ', '.join([a[1] for a in self.args])
+
+ # Returns the implementations whose LV_HAVE_* dependencies are all in `archs`.
+ def get_impls(self, archs):
+ archs = set(archs)
+ impls = list()
+ for impl in self._impls:
+ if impl.deps.intersection(archs) == impl.deps:
+ impls.append(impl)
+ return impls
+
+ def __repr__(self):
+ return self.name
+
+########################################################################
+# Extract information from the VOLK kernels
+########################################################################
+# Import-time discovery: srcdir is the parent of this file's directory, and
+# every *.h under <srcdir>/kernels/volk is parsed into a kernel_class.
+__file__ = os.path.abspath(__file__)
+srcdir = os.path.dirname(os.path.dirname(__file__))
+kernel_files = glob.glob(os.path.join(srcdir, "kernels", "volk", "*.h"))
+kernels = map(kernel_class, kernel_files)
+
+# Running the module directly prints the parsed kernels (Python 2 print).
+if __name__ == '__main__':
+ print kernels
diff --git a/volk/gen/volk_machine_defs.py b/volk/gen/volk_machine_defs.py
new file mode 100644
index 000000000..7293d4746
--- /dev/null
+++ b/volk/gen/volk_machine_defs.py
@@ -0,0 +1,74 @@
+#
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+from volk_arch_defs import arch_dict
+
+machines = list()
+machine_dict = dict()
+
+# A machine is a named, ordered combination of archs.
+#   archs       arch objects looked up in arch_dict
+#   arch_names  the matching names, in the same order
+#   alignment   the largest alignment among the machine's archs
+class machine_class:
+ def __init__(self, name, archs):
+ self.name = name
+ self.archs = list()
+ self.arch_names = list()
+ for arch_name in archs:
+ # empty names are ignored; an unknown name raises KeyError
+ if not arch_name: continue
+ arch = arch_dict[arch_name]
+ self.archs.append(arch)
+ self.arch_names.append(arch_name)
+ self.alignment = max(map(lambda a: a.alignment, self.archs))
+
+ def __repr__(self): return self.name
+
+# Registers a machine, expanding arch entries that contain '|'.
+# For the first such entry, each alternative is registered recursively:
+#   a non-empty alternative replaces the entry and '_' + alternative is
+#   appended to the machine name;
+#   an empty alternative (from a trailing '|') removes the entry and keeps
+#   the name unchanged.
+# Entries without '|' lead to a single machine_class stored in `machines`
+# and `machine_dict`.
+def register_machine(name, archs):
+ for i, arch_name in enumerate(archs):
+ if '|' in arch_name: #handle special arch names with the '|'
+ for arch_sub in arch_name.split('|'):
+ if arch_sub:
+ register_machine(name+'_'+arch_sub, archs[:i] + [arch_sub] + archs[i+1:])
+ else:
+ register_machine(name, archs[:i] + archs[i+1:])
+ # the recursive calls handle any later '|' entries
+ return
+ machine = machine_class(name=name, archs=archs)
+ machines.append(machine)
+ machine_dict[machine.name] = machine
+
+########################################################################
+# register the machines
+########################################################################
+#TODO skip the XML and put it here
+# Import-time loader: parses machines.xml (located next to this file) and
+# registers every <machine> element.
+from xml.dom import minidom
+import os
+gendir = os.path.dirname(__file__)
+machines_xml = minidom.parse(os.path.join(gendir, 'machines.xml')).getElementsByTagName('machine')
+for machine_xml in machines_xml:
+ kwargs = dict()
+ # XML attributes (name="...") become keyword arguments
+ for attr in machine_xml.attributes.keys():
+ kwargs[attr] = machine_xml.attributes[attr].value
+ # Child elements become keyword arguments keyed by tag name; nodes without
+ # a tagName or without text raise and are skipped by the bare except.
+ for node in machine_xml.childNodes:
+ try:
+ name = node.tagName
+ val = machine_xml.getElementsByTagName(name)[0].firstChild.data
+ kwargs[name] = val
+ except: pass
+ # the <archs> text is a whitespace-separated list of arch entries
+ kwargs['archs'] = kwargs['archs'].split()
+ #force kwargs keys to be of type str, not unicode for py25
+ kwargs = dict((str(k), v) for k, v in kwargs.iteritems())
+ register_machine(**kwargs)
+
+# Running the module directly prints the registered machines (Python 2 print).
+if __name__ == '__main__':
+ print machines
diff --git a/volk/gen/volk_tmpl_utils.py b/volk/gen/volk_tmpl_utils.py
new file mode 100644
index 000000000..6c08a8213
--- /dev/null
+++ b/volk/gen/volk_tmpl_utils.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+#
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+import os
+import re
+import sys
+import optparse
+import volk_arch_defs
+import volk_machine_defs
+import volk_kernel_defs
+from Cheetah import Template
+
+# Returns `code` with a backslash inserted before the '#' of selected
+# C preprocessor lines. NOTE(review): presumably this stops the template
+# engine from treating them as its own directives - confirm against Cheetah.
+# The returned text is joined with '\n' and has no trailing newline.
+def __escape_pre_processor(code):
+ out = list()
+ for line in code.splitlines():
+ m = re.match('^(\s*)#(\s*)(\w+)(.*)$', line)
+ if m:
+ p0, p1, fcn, stuff = m.groups()
+ # directives in `conly` are always escaped
+ conly = fcn in ('include', 'define', 'ifdef', 'ifndef', 'endif', 'elif', 'pragma')
+ # 'if' and 'else' are escaped only when the rest of the line has no '$',
+ # or when it contains 'defined'
+ both = fcn in ('if', 'else')
+ istmpl = '$' in stuff
+ if 'defined' in stuff: istmpl = False
+ if conly or (both and not istmpl):
+ line = '%s\\#%s%s%s'%(p0, p1, fcn, stuff)
+ out.append(line)
+ return '\n'.join(out)
+
+# Renders the template text `_tmpl` and returns the result as a string.
+# The template namespace holds the arch, machine and kernel tables plus any
+# extra kwargs, which override the defaults on a name clash.
+def __parse_tmpl(_tmpl, **kwargs):
+ defs = {
+ 'archs': volk_arch_defs.archs,
+ 'arch_dict': volk_arch_defs.arch_dict,
+ 'machines': volk_machine_defs.machines,
+ 'machine_dict': volk_machine_defs.machine_dict,
+ 'kernels': volk_kernel_defs.kernels,
+ }
+ defs.update(kwargs)
+ # escape C preprocessor lines before handing the text to the template engine
+ _tmpl = __escape_pre_processor(_tmpl)
+ # prepend a "generated file" banner to the output
+ _tmpl = """
+
+/* this file was generated by volk template utils, do not edit! */
+
+""" + _tmpl
+ return str(Template.Template(_tmpl, defs))
+
+# Command-line entry point: renders the template file given by --input.
+# Positional arguments are exposed to the template as `args`. The result is
+# written to --output when given, otherwise printed to stdout.
+def main():
+ parser = optparse.OptionParser()
+ parser.add_option('--input', type='string')
+ parser.add_option('--output', type='string')
+ (opts, args) = parser.parse_args()
+
+ output = __parse_tmpl(open(opts.input).read(), args=args)
+ if opts.output: open(opts.output, 'w').write(output)
+ else: print output
+
+if __name__ == '__main__': main()
diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
new file mode 100644
index 000000000..1f6554af8
--- /dev/null
+++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+#define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+  /*
+   * Dot product of num_points 16-bit samples with complex taps:
+   *   *result = sum over i of taps[i] * (float)input[i]
+   * Four partial sums are kept so the main loop handles four points per
+   * pass; leftover points are folded into the first partial sum.
+   */
+  lv_32fc_t sum0 = 0;
+  lv_32fc_t sum1 = 0;
+  lv_32fc_t sum2 = 0;
+  lv_32fc_t sum3 = 0;
+
+  /* largest multiple of four that does not exceed num_points */
+  const unsigned int unrolledEnd = num_points - (num_points % 4);
+  unsigned int idx = 0;
+
+  while(idx < unrolledEnd) {
+    sum0 += taps[idx + 0] * (float)input[idx + 0];
+    sum1 += taps[idx + 1] * (float)input[idx + 1];
+    sum2 += taps[idx + 2] * (float)input[idx + 2];
+    sum3 += taps[idx + 3] * (float)input[idx + 3];
+    idx += 4;
+  }
+
+  /* remaining zero to three points */
+  while(idx < num_points) {
+    sum0 += taps[idx] * (float)input[idx];
+    ++idx;
+  }
+
+  *result = sum0 + sum1 + sum2 + sum3;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_MMX
+
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+  /*
+   * Dot product of num_points 16-bit samples with complex float taps:
+   *   *result = sum over i of taps[i] * (float)input[i]
+   *
+   * result     receives the complex sum
+   * input      real 16-bit samples
+   * taps       complex taps, read as interleaved floats with _mm_load_ps,
+   *            so the buffer must be 16-byte aligned
+   * num_points number of samples/taps
+   */
+  unsigned int number = 0;
+  /* each vector iteration consumes 8 samples and 8 complex taps */
+  const unsigned int eighthPoints = num_points / 8;
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const short* aPtr = input;
+  const float* bPtr = (float*)taps;
+
+  __m64 m0, m1;
+  __m128 f0, f1, f2, f3;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < eighthPoints; number++){
+
+    /* convert 2 x 4 samples to float through MMX registers */
+    m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+    m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+    /* f0/f1 (and f2/f3) are deliberately identical: interleaving a vector
+       with itself duplicates every sample, giving (s0, s0, s1, s1) to line
+       up with the (re, im, re, im) layout of the taps */
+    f0 = _mm_cvtpi16_ps(m0);
+    f1 = _mm_cvtpi16_ps(m0);
+    f2 = _mm_cvtpi16_ps(m1);
+    f3 = _mm_cvtpi16_ps(m1);
+
+    a0Val = _mm_unpacklo_ps(f0, f1);
+    a1Val = _mm_unpackhi_ps(f0, f1);
+    a2Val = _mm_unpacklo_ps(f2, f3);
+    a3Val = _mm_unpackhi_ps(f2, f3);
+
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 8;
+    bPtr += 16;
+  }
+
+  /* The MMX registers alias the x87 register stack. Clear the MMX state
+     before the scalar float arithmetic below, otherwise builds that use x87
+     for floats (e.g. 32-bit x86) compute garbage. */
+  _mm_empty();
+
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  /* lanes 0/2 hold real partial sums, lanes 1/3 imaginary partial sums */
+  *realpt = dotProductVector[0];
+  *imagpt = dotProductVector[1];
+  *realpt += dotProductVector[2];
+  *imagpt += dotProductVector[3];
+
+  /* scalar tail: aPtr and bPtr already point just past the vector part */
+  number = eighthPoints*8;
+  for(;number < num_points; number++){
+    *realpt += ((*aPtr) * (*bPtr++));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
+
+
+#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a.h b/volk/include/volk/volk_16i_branch_4_state_8_a.h
new file mode 100644
index 000000000..6338fbdd1
--- /dev/null
+++ b/volk/include/volk/volk_16i_branch_4_state_8_a.h
@@ -0,0 +1,194 @@
+#ifndef INCLUDED_volk_16i_branch_4_state_8_a_H
+#define INCLUDED_volk_16i_branch_4_state_8_a_H
+
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+
+
+#ifdef LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+/*
+ * SSSE3 implementation: produces four 8-lane vectors of 16-bit results in
+ * target. Each one is a byte-shuffle of the first 8 shorts of src0
+ * (controlled by permuters[0..3]) plus scalar and masked control terms.
+ * All pointers are accessed with _mm_load_si128/_mm_store_si128, which
+ * require 16-byte alignment.
+ */
+static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+
+ __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+
+
+
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+ p_cntl2 = (__m128i*)cntl2;
+ p_cntl3 = (__m128i*)cntl3;
+ p_scalars = (__m128i*)scalars;
+
+ int i = 0;
+
+ /* the loop body below runs exactly once */
+ int bound = 1;
+
+
+ xmm0 = _mm_load_si128(p_scalars);
+
+ /* broadcast scalars[0..3] to all eight 16-bit lanes of xmm1..xmm4:
+    first across the low four lanes, then across all 32-bit pairs */
+ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+
+ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+ /* byte-shuffle control masks, one per output vector */
+ xmm0 = _mm_load_si128((__m128i*)permuters[0]);
+ xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+ xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+ xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+
+ for(; i < bound; ++i) {
+
+ xmm5 = _mm_load_si128(p_src0);
+
+
+
+
+
+
+
+
+
+ /* permute the source four ways; the mask registers are overwritten
+    with the permuted data */
+ xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
+ xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+ xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+ xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+
+ p_src0 += 4;
+
+
+ /* scalar terms: vector 0 gets scalars[0] + scalars[1], vector 1 gets
+    scalars[1], vector 2 gets scalars[0], vector 3 gets neither */
+ xmm5 = _mm_add_epi16(xmm1, xmm2);
+
+ xmm6 = _mm_add_epi16(xmm2, xmm6);
+ xmm8 = _mm_add_epi16(xmm1, xmm8);
+
+
+ /* control terms for vector k: (cntl2[k] & scalars[2]) + (cntl3[k] & scalars[3]) */
+ xmm7 = _mm_load_si128(p_cntl2);
+ xmm9 = _mm_load_si128(p_cntl3);
+
+ xmm0 = _mm_add_epi16(xmm5, xmm0);
+
+
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+ xmm5 = _mm_load_si128(&p_cntl2[1]);
+ xmm11 = _mm_load_si128(&p_cntl3[1]);
+
+ xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm7);
+
+
+
+ xmm7 = _mm_load_si128(&p_cntl2[2]);
+ xmm9 = _mm_load_si128(&p_cntl3[2]);
+
+ xmm5 = _mm_add_epi16(xmm5, xmm11);
+
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+ xmm6 = _mm_add_epi16(xmm6, xmm5);
+
+
+ xmm5 = _mm_load_si128(&p_cntl2[3]);
+ xmm11 = _mm_load_si128(&p_cntl3[3]);
+
+ xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+ xmm8 = _mm_add_epi16(xmm8, xmm7);
+
+ xmm5 = _mm_add_epi16(xmm5, xmm11);
+
+ _mm_store_si128(p_target, xmm0);
+ _mm_store_si128(&p_target[1], xmm6);
+
+ xmm10 = _mm_add_epi16(xmm5, xmm10);
+
+ _mm_store_si128(&p_target[2], xmm8);
+
+ _mm_store_si128(&p_target[3], xmm10);
+
+ /* NOTE(review): four vectors are stored but the pointer advances by
+    three; this has no effect while bound == 1 */
+ p_target += 3;
+ }
+}
+
+
+#endif /*LV_HAVE_SSSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_16i_branch_4_state_8_a_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+  /*
+   * Fills target[0..31] as four rows of eight values. For row r, column c:
+   *   - the source sample is picked by the byte offset permuters[r][2*c],
+   *     halved to turn it into a short index into src0
+   *   - scalars[0] is added on rows 0 and 2, scalars[1] on rows 0 and 1
+   *   - cntl2 and cntl3 contribute after being masked with scalars[2]
+   *     and scalars[3] respectively
+   */
+  int row;
+  int col;
+
+  for(row = 0; row < 4; ++row) {
+    for(col = 0; col < 8; ++col) {
+      const int idx = row * 8 + col;
+      target[idx] = src0[((char)permuters[row][col * 2])/2]
+        + ((row + 1)%2 * scalars[0])
+        + (((row >> 1)^1) * scalars[1])
+        + (cntl2[idx] & scalars[2])
+        + (cntl3[idx] & scalars[3]);
+    }
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_branch_4_state_8_a_H*/
diff --git a/volk/include/volk/volk_16i_convert_8i_a.h b/volk/include/volk/volk_16i_convert_8i_a.h
new file mode 100644
index 000000000..84548c8c5
--- /dev/null
+++ b/volk/include/volk/volk_16i_convert_8i_a.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_volk_16i_convert_8i_a_H
+#define INCLUDED_volk_16i_convert_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Narrows 16 bit integers to 8 bit integers by keeping the upper byte of every value (arithmetic shift right by 8)
+  \param outputVector The 8 bit output data buffer; written with aligned 16 byte stores
+  \param inputVector The 16 bit input data buffer; read with aligned 16 byte loads
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  const unsigned int blocks = num_points / 16;
+  __m128i* src = (__m128i*)inputVector;
+  __m128i* dst = (__m128i*)outputVector;
+  unsigned int blk;
+  unsigned int idx;
+
+  // Vector body: 16 values per pass
+  for(blk = 0; blk < blocks; blk++){
+    __m128i lower = _mm_srai_epi16(_mm_load_si128(src + 2 * blk), 8);
+    __m128i upper = _mm_srai_epi16(_mm_load_si128(src + 2 * blk + 1), 8);
+
+    // After the shift every lane already fits in 8 bits, so the saturating pack does not clip
+    _mm_store_si128(dst + blk, _mm_packs_epi16(lower, upper));
+  }
+
+  // Scalar tail: the remaining num_points % 16 values
+  for(idx = blocks * 16; idx < num_points; idx++){
+    outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Narrows 16 bit integers to 8 bit integers by keeping the upper byte of every value (shift right by 8)
+  \param outputVector The 8 bit output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_convert_8i_a_H */
diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/include/volk/volk_16i_convert_8i_u.h
new file mode 100644
index 000000000..80608a141
--- /dev/null
+++ b/volk/include/volk/volk_16i_convert_8i_u.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_16i_convert_8i_u_H
+#define INCLUDED_volk_16i_convert_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Narrows 16 bit integers to 8 bit integers by keeping the upper byte of every value (arithmetic shift right by 8)
+  \param outputVector The 8 bit output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param num_points The number of data values to be converted
+  \note Input and output buffers do NOT need to be properly aligned (unaligned loads and stores are used)
+*/
+static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  const unsigned int blocks = num_points / 16;
+  __m128i* src = (__m128i*)inputVector;
+  __m128i* dst = (__m128i*)outputVector;
+  unsigned int blk;
+  unsigned int idx;
+
+  // Vector body: 16 values per pass
+  for(blk = 0; blk < blocks; blk++){
+    __m128i lower = _mm_srai_epi16(_mm_loadu_si128(src + 2 * blk), 8);
+    __m128i upper = _mm_srai_epi16(_mm_loadu_si128(src + 2 * blk + 1), 8);
+
+    // After the shift every lane already fits in 8 bits, so the saturating pack does not clip
+    _mm_storeu_si128(dst + blk, _mm_packs_epi16(lower, upper));
+  }
+
+  // Scalar tail: the remaining num_points % 16 values
+  for(idx = blocks * 16; idx < num_points; idx++){
+    outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Narrows 16 bit integers to 8 bit integers by keeping the upper byte of every value (shift right by 8)
+  \param outputVector The 8 bit output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param num_points The number of data values to be converted
+  \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_convert_8i_u_H */
diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/include/volk/volk_16i_max_star_16i_a.h
new file mode 100644
index 000000000..edfff8a82
--- /dev/null
+++ b/volk/include/volk/volk_16i_max_star_16i_a.h
@@ -0,0 +1,108 @@
+#ifndef INCLUDED_volk_16i_max_star_16i_a_H
+#define INCLUDED_volk_16i_max_star_16i_a_H
+
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+/*
+ * Reduces src0 to a single "max star" value and writes it to target[0].
+ *
+ * target    - receives the result in target[0]
+ * src0      - input samples; read with aligned 16 byte loads in the vector loop
+ * num_bytes - size of src0 in bytes (num_bytes / 2 samples); src0[0] is always read
+ *
+ * The vector loop keeps eight per-lane running maxima (signed 16 bit compare).
+ * The eight lanes and the trailing (num_bytes / 2) % 8 samples are then folded
+ * into one value with the wrapping comparison (short)(a - b) > 0.
+ */
+static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_bytes) {
+
+  short candidate = src0[0];
+  short cands[8];
+  __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
+
+  __m128i *p_src0 = (__m128i*)src0;
+
+  int bound = num_bytes >> 4;
+  int leftovers = (num_bytes >> 1) & 7;
+
+  int i = 0;
+
+  // Seed every lane with the first sample. Seeding with zeros made 0 win over
+  // all-negative input and over inputs shorter than one vector.
+  xmm0 = _mm_set1_epi16(candidate);
+
+  for(i = 0; i < bound; ++i) {
+    xmm1 = _mm_load_si128(p_src0);
+    p_src0 += 1;
+
+    xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); // lanes where the running max stays
+    xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
+    xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
+
+    xmm6 = _mm_xor_si128(xmm4, xmm5);   // lanes where the new sample is >= the running max
+
+    xmm3 = _mm_and_si128(xmm3, xmm0);
+    xmm4 = _mm_and_si128(xmm6, xmm1);
+
+    // The two masks are disjoint, so the add merges the selected lanes
+    xmm0 = _mm_add_epi16(xmm3, xmm4);
+  }
+
+  // cands is a plain short array without a 16 byte alignment guarantee: store unaligned
+  _mm_storeu_si128((__m128i*)cands, xmm0);
+
+  for(i = 0; i < 8; ++i) {
+    candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+  }
+
+  for(i = 0; i < leftovers; ++i) {
+    candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
+  }
+
+  target[0] = candidate;
+}
+
+#endif /*LV_HAVE_SSSE3*/
+
+#ifdef LV_HAVE_GENERIC
+
+/*
+ * Reduces the num_bytes / 2 samples of src0 to a single value written to target[0].
+ * Starting from src0[0], each later sample replaces the running value unless
+ * (short)(running - sample) > 0, i.e. the comparison uses the wrapped 16 bit difference.
+ */
+static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, unsigned int num_bytes) {
+  const int count = num_bytes >> 1;
+  short best = src0[0];
+  int idx;
+
+  for(idx = 1; idx < count; ++idx) {
+    const short challenger = src0[idx];
+    if(!((short)(best - challenger) > 0)) {
+      best = challenger;
+    }
+  }
+
+  target[0] = best;
+}
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_max_star_16i_a_H*/
diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h
new file mode 100644
index 000000000..c1c908425
--- /dev/null
+++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h
@@ -0,0 +1,130 @@
+#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
+#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
+
+#include <volk/volk_common.h>
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+/*
+ * Pairwise "max star": for every adjacent pair (src0[2k], src0[2k+1]) writes the
+ * winner to target[k], where src0[2k] wins when (int16_t)(src0[2k] - src0[2k+1]) > 0.
+ *
+ * target    - output buffer; written with aligned 16 byte stores in the main loop
+ * src0      - input buffer; read with aligned 16 byte loads
+ * num_bytes - size of src0 in bytes
+ *
+ * Work is split into 16-sample steps, at most one 8-sample step, and a scalar tail.
+ */
+static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+
+  // Byte shuffle selectors picking the first sample of every pair (0xff zeroes the byte)
+  static const uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+  static const uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
+  // Added to a selector (+2 bytes) to move it to the second sample of the pair
+  static const uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+  static const uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
+
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+  __m128i xmm5, xmm6, xmm7, xmm8;
+  const __m128i zero = _mm_setzero_si128();
+
+  // The tables are plain byte arrays with no 16 byte alignment guarantee: load unaligned
+  xmm4 = _mm_loadu_si128((const __m128i*)shufmask0);
+  xmm5 = _mm_loadu_si128((const __m128i*)shufmask1);
+  xmm6 = _mm_loadu_si128((const __m128i*)andmask0);
+  xmm7 = _mm_loadu_si128((const __m128i*)andmask1);
+
+  __m128i *p_target, *p_src0;
+
+  p_target = (__m128i*)target;
+  p_src0 = (__m128i*)src0;
+
+  int bound = num_bytes >> 5;
+  int intermediate = (num_bytes >> 4) & 1;
+  int leftovers = (num_bytes >> 1) & 7;
+
+  int i = 0;
+
+  for(i = 0; i < bound; ++i) {
+    xmm0 = _mm_load_si128(p_src0);
+    xmm1 = _mm_load_si128(&p_src0[1]);
+    p_src0 += 2;
+
+    // first - second for each pair; low half from xmm0, high half from xmm1
+    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+
+    // all-ones where the difference is negative, i.e. the second sample wins
+    xmm2 = _mm_cmpgt_epi16(zero, xmm3);
+
+    xmm8 = _mm_and_si128(xmm2, xmm6);
+    xmm3 = _mm_and_si128(xmm2, xmm7);
+
+    xmm8 = _mm_add_epi8(xmm8, xmm4);
+    xmm3 = _mm_add_epi8(xmm3, xmm5);
+
+    xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
+    xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
+
+    // xmm0 holds winners in its low half, xmm1 in its high half; the rest is zero
+    xmm3 = _mm_add_epi16(xmm0, xmm1);
+
+    _mm_store_si128(p_target, xmm3);
+
+    p_target += 1;
+  }
+
+  if(intermediate) {
+    xmm0 = _mm_load_si128(p_src0);
+    p_src0 += 1;
+
+    // Only the low half of the result is used, so feed xmm0 twice rather than a stale register
+    xmm3 = _mm_hsub_epi16(xmm0, xmm0);
+    xmm2 = _mm_cmpgt_epi16(zero, xmm3);
+
+    xmm8 = _mm_and_si128(xmm2, xmm6);
+
+    xmm3 = _mm_add_epi8(xmm8, xmm4);
+
+    xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
+
+    // Store the four winners (low 64 bits)
+    _mm_storel_epi64(p_target, xmm0);
+
+    p_target = (__m128i*)((int8_t*)p_target + 8);
+  }
+
+  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
+    target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+  }
+}
+
+#endif /*LV_HAVE_SSSE3*/
+
+
+#ifdef LV_HAVE_GENERIC
+/*
+ * Pairwise "max star": walks src0 two samples at a time over num_bytes / 2 samples and
+ * writes one winner per pair to target. The first sample of a pair wins when
+ * (int16_t)(first - second) > 0; otherwise the second sample is written.
+ */
+static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+  const int count = num_bytes >> 1;
+  int first;
+
+  for(first = 0; first < count; first += 2) {
+    const int16_t left = src0[first];
+    const int16_t right = src0[first + 1];
+
+    target[first >> 1] = ((int16_t)(left - right) > 0) ? left : right;
+  }
+}
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/
diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h
new file mode 100644
index 000000000..47e3cbf9c
--- /dev/null
+++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h
@@ -0,0 +1,139 @@
+#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
+#define INCLUDED_volk_16i_permute_and_scalar_add_a_H
+
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+
+
+#ifdef LV_HAVE_SSE2
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+/*
+ * For every sample i in [0, num_bytes / 2):
+ *   target[i] = src0[permute_indexes[i]]
+ *             + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1])
+ *             + (cntl2[i] & scalars[2]) + (cntl3[i] & scalars[3])
+ *
+ * target, cntl0..cntl3 - accessed with aligned 16 byte loads/stores in the vector loop
+ * src0                 - gathered one element at a time through permute_indexes
+ * scalars              - only scalars[0..3] are read
+ * num_bytes            - size of target in bytes
+ */
+static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+  __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3;
+
+  short* p_permute_indexes = permute_indexes;
+
+  p_target = (__m128i*)target;
+  p_cntl0 = (__m128i*)cntl0;
+  p_cntl1 = (__m128i*)cntl1;
+  p_cntl2 = (__m128i*)cntl2;
+  p_cntl3 = (__m128i*)cntl3;
+
+  int i = 0;
+
+  int bound = (num_bytes >> 4);
+  int leftovers = (num_bytes >> 1) & 7;
+
+  // Broadcast each scalar on its own. A 16 byte aligned load of scalars would read
+  // eight shorts although only four are part of the interface.
+  xmm1 = _mm_set1_epi16(scalars[0]);
+  xmm2 = _mm_set1_epi16(scalars[1]);
+  xmm3 = _mm_set1_epi16(scalars[2]);
+  xmm4 = _mm_set1_epi16(scalars[3]);
+
+  for(; i < bound; ++i) {
+    xmm0 = _mm_setzero_si128();
+    xmm5 = _mm_setzero_si128();
+    xmm6 = _mm_setzero_si128();
+    xmm7 = _mm_setzero_si128();
+
+    // Gather the eight permuted samples into four registers, two lanes each;
+    // the lanes are disjoint, so the adds below merge them
+    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
+    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
+    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
+    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
+    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
+    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
+    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
+    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
+
+    xmm0 = _mm_add_epi16(xmm0, xmm5);
+    xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+    p_permute_indexes += 8;
+
+    xmm0 = _mm_add_epi16(xmm0, xmm6);
+
+    xmm5 = _mm_load_si128(p_cntl0);
+    xmm6 = _mm_load_si128(p_cntl1);
+    xmm7 = _mm_load_si128(p_cntl2);
+
+    xmm5 = _mm_and_si128(xmm5, xmm1);
+    xmm6 = _mm_and_si128(xmm6, xmm2);
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+
+    xmm0 = _mm_add_epi16(xmm0, xmm5);
+
+    xmm5 = _mm_load_si128(p_cntl3);
+
+    xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+    p_cntl0 += 1;
+
+    xmm5 = _mm_and_si128(xmm5, xmm4);
+
+    xmm0 = _mm_add_epi16(xmm0, xmm6);
+
+    p_cntl1 += 1;
+    p_cntl2 += 1;
+
+    xmm0 = _mm_add_epi16(xmm0, xmm5);
+
+    p_cntl3 += 1;
+
+    _mm_store_si128(p_target, xmm0);
+
+    p_target += 1;
+  }
+
+  // Scalar tail: the remaining (num_bytes / 2) % 8 samples
+  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+    target[i] = src0[permute_indexes[i]]
+      + (cntl0[i] & scalars[0])
+      + (cntl1[i] & scalars[1])
+      + (cntl2[i] & scalars[2])
+      + (cntl3[i] & scalars[3]);
+  }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#ifdef LV_HAVE_GENERIC
+/*
+ * For every sample i in [0, num_bytes / 2):
+ *   target[i] = src0[permute_indexes[i]]
+ *             + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1])
+ *             + (cntl2[i] & scalars[2]) + (cntl3[i] & scalars[3])
+ */
+static inline void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+  const int count = num_bytes >> 1;
+  int idx;
+
+  for(idx = 0; idx < count; ++idx) {
+    // Accumulate in int, exactly as the promoted short operands would; narrowed on store
+    int sum = src0[permute_indexes[idx]];
+
+    sum += (cntl0[idx] & scalars[0]);
+    sum += (cntl1[idx] & scalars[1]);
+    sum += (cntl2[idx] & scalars[2]);
+    sum += (cntl3[idx] & scalars[3]);
+
+    target[idx] = sum;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/
diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a.h b/volk/include/volk/volk_16i_s32f_convert_32f_a.h
new file mode 100644
index 000000000..7108ff659
--- /dev/null
+++ b/volk/include/volk/volk_16i_s32f_convert_32f_a.h
@@ -0,0 +1,119 @@
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
+#define INCLUDED_volk_16i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+   \brief Converts 16 bit integer data to floating point and scales every output by 1/scalar
+   \param outputVector The floating point output data buffer
+   \param inputVector The 16 bit input data buffer
+   \param scalar The value each converted point is divided by
+   \param num_points The number of data values to be converted
+   \note The vector loop multiplies by the reciprocal 1.0/scalar; the scalar tail divides by scalar
+ */
+static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  const unsigned int blocks = num_points / 8;
+  const __m128 reciprocal = _mm_set_ps1(1.0/scalar);
+  unsigned int blk;
+  unsigned int idx;
+
+  for(blk = 0; blk < blocks; blk++){
+    float* dst = outputVector + 8 * blk;
+
+    // Eight 16 bit inputs in one register
+    const __m128i packed = _mm_loadu_si128((__m128i*)(inputVector + 8 * blk));
+
+    // Sign-extend the lower four, and (after a 8 byte right shift) the upper four, to 32 bits
+    const __m128i lowWords = _mm_cvtepi16_epi32(packed);
+    const __m128i highWords = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));
+
+    _mm_storeu_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(lowWords), reciprocal));
+    _mm_storeu_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(highWords), reciprocal));
+  }
+
+  // Scalar tail: the remaining num_points % 8 values
+  for(idx = blocks * 8; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+ /*!
+   \brief Converts 16 bit integer data to floating point and scales every output by 1/scalar
+   \param outputVector The floating point output data buffer
+   \param inputVector The 16 bit input data buffer
+   \param scalar The value each converted point is divided by
+   \param num_points The number of data values to be converted
+   \note The vector loop multiplies by the reciprocal 1.0/scalar; the scalar tail divides by scalar
+ */
+static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  const unsigned int blocks = num_points / 4;
+  const __m128 reciprocal = _mm_set_ps1(1.0/scalar);
+  unsigned int blk;
+  unsigned int idx;
+
+  for(blk = 0; blk < blocks; blk++){
+    const int16_t* in = inputVector + 4 * blk;
+
+    // _mm_set_ps takes its arguments from the highest lane down to the lowest
+    const __m128 converted = _mm_set_ps((float)(in[3]), (float)(in[2]), (float)(in[1]), (float)(in[0]));
+
+    _mm_storeu_ps(outputVector + 4 * blk, _mm_mul_ps(converted, reciprocal));
+  }
+
+  // Scalar tail: the remaining num_points % 4 values
+  for(idx = blocks * 4; idx < num_points; idx++){
+    outputVector[idx] = (float)(inputVector[idx]) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Converts 16 bit integer data to floating point and divides every output by the scalar value
+   \param outputVector The floating point output data buffer
+   \param inputVector The 16 bit input data buffer
+   \param scalar The value each converted point is divided by
+   \param num_points The number of data values to be converted
+ */
+static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/include/volk/volk_16i_s32f_convert_32f_u.h
new file mode 100644
index 000000000..4ce8e8f35
--- /dev/null
+++ b/volk/include/volk/volk_16i_s32f_convert_32f_u.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
+#define INCLUDED_volk_16i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+   \brief Converts 16 bit integer data to floating point and scales every output by 1/scalar
+   \param outputVector The floating point output data buffer
+   \param inputVector The 16 bit input data buffer
+   \param scalar The value each converted point is divided by
+   \param num_points The number of data values to be converted
+   \note Output buffer does NOT need to be properly aligned
+   \note The vector loop multiplies by the reciprocal 1.0/scalar; the scalar tail divides by scalar
+ */
+static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  const unsigned int blocks = num_points / 8;
+  const __m128 reciprocal = _mm_set_ps1(1.0/scalar);
+  unsigned int blk;
+  unsigned int idx;
+
+  for(blk = 0; blk < blocks; blk++){
+    float* dst = outputVector + 8 * blk;
+
+    // Eight 16 bit inputs in one register
+    const __m128i packed = _mm_loadu_si128((__m128i*)(inputVector + 8 * blk));
+
+    // Sign-extend the lower four, and (after a 8 byte right shift) the upper four, to 32 bits
+    const __m128i lowWords = _mm_cvtepi16_epi32(packed);
+    const __m128i highWords = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));
+
+    _mm_storeu_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(lowWords), reciprocal));
+    _mm_storeu_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(highWords), reciprocal));
+  }
+
+  // Scalar tail: the remaining num_points % 8 values
+  for(idx = blocks * 8; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+ /*!
+   \brief Converts 16 bit integer data to floating point and scales every output by 1/scalar
+   \param outputVector The floating point output data buffer
+   \param inputVector The 16 bit input data buffer
+   \param scalar The value each converted point is divided by
+   \param num_points The number of data values to be converted
+   \note Output buffer does NOT need to be properly aligned
+   \note The vector loop multiplies by the reciprocal 1.0/scalar; the scalar tail divides by scalar
+ */
+static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  const unsigned int blocks = num_points / 4;
+  const __m128 reciprocal = _mm_set_ps1(1.0/scalar);
+  unsigned int blk;
+  unsigned int idx;
+
+  for(blk = 0; blk < blocks; blk++){
+    const int16_t* in = inputVector + 4 * blk;
+
+    // _mm_set_ps takes its arguments from the highest lane down to the lowest
+    const __m128 converted = _mm_set_ps((float)(in[3]), (float)(in[2]), (float)(in[1]), (float)(in[0]));
+
+    _mm_storeu_ps(outputVector + 4 * blk, _mm_mul_ps(converted, reciprocal));
+  }
+
+  // Scalar tail: the remaining num_points % 4 values
+  for(idx = blocks * 4; idx < num_points; idx++){
+    outputVector[idx] = (float)(inputVector[idx]) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Converts 16 bit integer data to floating point and divides every output by the scalar value
+   \param outputVector The floating point output data buffer
+   \param inputVector The 16 bit input data buffer
+   \param scalar The value each converted point is divided by
+   \param num_points The number of data values to be converted
+   \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h
new file mode 100644
index 000000000..0d8498553
--- /dev/null
+++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h
@@ -0,0 +1,191 @@
+#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
+#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
+
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+
+
+
+#ifdef LV_HAVE_SSE2
+
+#include<emmintrin.h>
+
+/*
+ * Element-wise "max star" of four input vectors:
+ *   target[i] = maxstar(maxstar(src0[i], src1[i]), maxstar(src2[i], src3[i]))
+ *
+ * target, src0..src3 - accessed with aligned 16 byte loads/stores in the vector loop
+ * num_bytes          - size of each buffer in bytes
+ *
+ * In the vector loop the second operand of a pair wins when the wrapping 16 bit
+ * difference (second - first) is greater than zero; otherwise the first is kept.
+ * The scalar tail handles the remaining (num_bytes / 2) % 8 samples.
+ */
+static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+  const int bound = (num_bytes >> 4);
+  const int leftovers = (num_bytes >> 1) & 7;
+  const int tail_end = (bound * 8) + leftovers;
+  const __m128i zero = _mm_setzero_si128();
+  int blk;
+  int idx;
+
+  for(blk = 0; blk < bound; ++blk) {
+    const __m128i in0 = _mm_load_si128((__m128i*)src0 + blk);
+    const __m128i in1 = _mm_load_si128((__m128i*)src1 + blk);
+    const __m128i in2 = _mm_load_si128((__m128i*)src2 + blk);
+    const __m128i in3 = _mm_load_si128((__m128i*)src3 + blk);
+
+    // All-ones lanes mark where the second operand of the pair is selected
+    const __m128i take1 = _mm_cmpgt_epi16(_mm_sub_epi16(in1, in0), zero);
+    const __m128i take3 = _mm_cmpgt_epi16(_mm_sub_epi16(in3, in2), zero);
+
+    // mask & b and ~mask & a never overlap, so the add blends the two
+    const __m128i best01 = _mm_add_epi16(_mm_and_si128(take1, in1), _mm_andnot_si128(take1, in0));
+    const __m128i best23 = _mm_add_epi16(_mm_and_si128(take3, in3), _mm_andnot_si128(take3, in2));
+
+    const __m128i take23 = _mm_cmpgt_epi16(_mm_sub_epi16(best23, best01), zero);
+    const __m128i best = _mm_add_epi16(_mm_and_si128(take23, best23), _mm_andnot_si128(take23, best01));
+
+    _mm_store_si128((__m128i*)target + blk, best);
+  }
+
+  for(idx = bound * 8; idx < tail_end; ++idx) {
+    const short first = ((short)(src0[idx] - src1[idx]) > 0) ? src0[idx] : src1[idx];
+    const short second = ((short)(src2[idx] - src3[idx])>0) ? src2[idx] : src3[idx];
+
+    target[idx] = ((short)(first - second)>0) ? first : second;
+  }
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+
+#ifdef LV_HAVE_GENERIC
+/*
+ * Element-wise "max star" of four input vectors over num_bytes / 2 samples:
+ *   target[i] = maxstar(maxstar(src0[i], src1[i]), maxstar(src2[i], src3[i]))
+ * where maxstar(a, b) yields a when (short)(a - b) > 0 and b otherwise.
+ */
+static inline void volk_16i_x4_quad_max_star_16i_a_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+  const int count = num_bytes >> 1;
+  int idx;
+
+  for(idx = 0; idx < count; ++idx) {
+    const short first = ((short)(src0[idx] - src1[idx]) > 0) ? src0[idx] : src1[idx];
+    const short second = ((short)(src2[idx] - src3[idx])>0) ? src2[idx] : src3[idx];
+
+    target[idx] = ((short)(first - second)>0) ? first : second;
+  }
+}
+
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/
diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h
new file mode 100644
index 000000000..5560b92d9
--- /dev/null
+++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h
@@ -0,0 +1,136 @@
+#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
+#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
+
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+
+
+
+#ifdef LV_HAVE_SSE2
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+/*
+ * Adds src0 to each of src1..src4, element by element:
+ *   target0[i] = src0[i] + src1[i]
+ *   target1[i] = src0[i] + src2[i]
+ *   target2[i] = src0[i] + src3[i]
+ *   target3[i] = src0[i] + src4[i]
+ *
+ * All buffers are accessed with aligned 16 byte loads/stores in the vector loop.
+ * num_bytes is the size of each buffer in bytes; the scalar tail handles the
+ * remaining (num_bytes / 2) % 8 samples.
+ */
+static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+  const int bound = (num_bytes >> 4);
+  const int leftovers = (num_bytes >> 1) & 7;
+  const int tail_end = (bound * 8) + leftovers;
+  int blk;
+  int idx;
+
+  for(blk = 0; blk < bound; ++blk) {
+    // All five loads happen before the first store of the iteration
+    const __m128i common = _mm_load_si128((__m128i*)src0 + blk);
+    const __m128i in1 = _mm_load_si128((__m128i*)src1 + blk);
+    const __m128i in2 = _mm_load_si128((__m128i*)src2 + blk);
+    const __m128i in3 = _mm_load_si128((__m128i*)src3 + blk);
+    const __m128i in4 = _mm_load_si128((__m128i*)src4 + blk);
+
+    const __m128i sum1 = _mm_add_epi16(common, in1);
+    const __m128i sum2 = _mm_add_epi16(common, in2);
+    const __m128i sum3 = _mm_add_epi16(common, in3);
+    const __m128i sum4 = _mm_add_epi16(common, in4);
+
+    _mm_store_si128((__m128i*)target0 + blk, sum1);
+    _mm_store_si128((__m128i*)target1 + blk, sum2);
+    _mm_store_si128((__m128i*)target2 + blk, sum3);
+    _mm_store_si128((__m128i*)target3 + blk, sum4);
+  }
+
+  for(idx = bound * 8; idx < tail_end; ++idx) {
+    // src0[idx] is read afresh for every sum
+    target0[idx] = src0[idx] + src1[idx];
+    target1[idx] = src0[idx] + src2[idx];
+    target2[idx] = src0[idx] + src3[idx];
+    target3[idx] = src0[idx] + src4[idx];
+  }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*
+ * Adds src0 to each of src1..src4, element by element, over num_bytes / 2 samples:
+ *   target0[i] = src0[i] + src1[i]
+ *   target1[i] = src0[i] + src2[i]
+ *   target2[i] = src0[i] + src3[i]
+ *   target3[i] = src0[i] + src4[i]
+ */
+static inline void volk_16i_x5_add_quad_16i_x4_a_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+  const int count = num_bytes >> 1;
+  int idx = 0;
+
+  while(idx < count) {
+    // src0[idx] is read afresh for every sum
+    target0[idx] = src0[idx] + src1[idx];
+    target1[idx] = src0[idx] + src2[idx];
+    target2[idx] = src0[idx] + src3[idx];
+    target3[idx] = src0[idx] + src4[idx];
+    ++idx;
+  }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+
+#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/
diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h
new file mode 100644
index 000000000..f8aa30874
--- /dev/null
+++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
+#define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  /*
+    SSSE3 implementation. Each loop iteration consumes 32 input bytes
+    (8 complex points) and emits 8 I values and 8 Q values.
+    Aligned load/store intrinsics are used, so all three buffers must be
+    16-byte aligned. Points left over after the last full group of 8 are
+    handled by the scalar loop at the end.
+  */
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+  /* the input is only ever read: keep the const qualifier on every view of it */
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  const int16_t* int16ComplexVectorPtr;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+
+  /*
+    Byte shuffle masks (arguments are listed from byte 15 down to byte 0).
+    A control byte of 0x80 makes _mm_shuffle_epi8 write zero to that byte.
+    Mask1 gathers the four selected 16 bit words into the low 8 bytes,
+    Mask2 gathers them into the high 8 bytes, so the two results can be OR-ed.
+  */
+  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+  __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((const __m128i*)complexVectorPtr); complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((const __m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+    iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
+    qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+
+  /* scalar tail: continue from where the vector loop stopped, viewing the input as 16 bit words */
+  number = eighthPoints * 8;
+  int16ComplexVectorPtr = (const int16_t*)complexVectorPtr;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *int16ComplexVectorPtr++;
+    *qBufferPtr++ = *int16ComplexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ /*
+ SSE2 implementation: processes 8 complex points (two 16-byte loads) per
+ iteration using 16 bit / 32 bit shuffles, then finishes the remaining
+ points with a scalar loop. Uses aligned load/store intrinsics, so the
+ buffers must be 16-byte aligned.
+ */
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
+ /* lowMask keeps the low 64 bits of a register, highMask keeps the high 64 bits */
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
+ /* First vector: pair up the I words inside each 64 bit half, then move both I pairs into the low 64 bits */
+ iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ /* Second vector: same word pairing, but the final dword shuffle places the I pairs in the high 64 bits */
+ iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ /* Merge: low half from the first vector, high half from the second */
+ iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ /* Same scheme for Q: the word shuffles bring the Q words to the front of each half instead */
+ qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
+
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ /* Scalar tail for the num_points % 8 points not covered by the vector loop */
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  /*
+    Scalar reference implementation: views the complex input as a flat
+    stream of 16 bit words (I, Q, I, Q, ...) and copies the even words to
+    iBuffer and the odd words to qBuffer.
+  */
+  const int16_t* interleaved = (const int16_t*)complexVector;
+  unsigned int point;
+
+  for(point = 0; point < num_points; ++point){
+    iBuffer[point] = interleaved[0];
+    qBuffer[point] = interleaved[1];
+    interleaved += 2; /* advance to the next I/Q pair */
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+/* Defined outside this header; only the prototype is visible here (presumably supplied by the ORC backend - confirm). */
+extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+/* Thin inline wrapper: forwards every argument unchanged to the _orc_impl entry point. */
+static inline void volk_16ic_deinterleave_16i_x2_a_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */
diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h
new file mode 100644
index 000000000..bac1f2e4b
--- /dev/null
+++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H
+#define INCLUDED_volk_16ic_deinterleave_real_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  /*
+    SSSE3 implementation: extracts the I (first) word of every complex
+    point. Eight points (two aligned 16-byte loads) are handled per
+    iteration; the remainder is copied by a scalar loop.
+  */
+  const unsigned int eighthPoints = num_points / 8;
+  const int16_t* inPtr = (const int16_t*)complexVector;
+  int16_t* outPtr = iBuffer;
+  unsigned int number;
+
+  /* 0x80 control bytes zero the destination byte, so the two shuffled halves can simply be OR-ed */
+  const __m128i lowHalfMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  const __m128i highHalfMask = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  __m128i first, second;
+
+  for(number = 0; number < eighthPoints; number++){
+    first = _mm_load_si128((const __m128i*)inPtr);
+    second = _mm_load_si128((const __m128i*)(inPtr + 8));
+    inPtr += 16;
+
+    /* I words of the first vector land in the low 8 bytes, those of the second in the high 8 bytes */
+    _mm_store_si128((__m128i*)outPtr, _mm_or_si128(_mm_shuffle_epi8(first, lowHalfMask), _mm_shuffle_epi8(second, highHalfMask)));
+
+    outPtr += 8;
+  }
+
+  /* scalar tail: copy I, skip Q */
+  for(number = eighthPoints * 8; number < num_points; number++){
+    *outPtr++ = *inPtr;
+    inPtr += 2;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ /*
+ SSE2 implementation: extracts the I word of every complex point, 8 points
+ (two aligned 16-byte loads) per iteration, followed by a scalar tail loop.
+ */
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i complexVal1, complexVal2, iOutputVal;
+ /* lowMask keeps the low 64 bits of a register, highMask keeps the high 64 bits */
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
+ /* First vector: pair the I words within each 64 bit half, then gather both pairs into the low 64 bits */
+ complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ /* Second vector: same pairing, but the dword shuffle gathers the I pairs into the high 64 bits */
+ complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ /* Merge the two halves into one vector of 8 I values */
+ iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 8;
+ }
+
+ /* Scalar tail: copy the I word, then step over the Q word */
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  /*
+    Scalar reference implementation: copies the first 16 bit word (I) of
+    each complex point to iBuffer and ignores the second word (Q).
+  */
+  const int16_t* interleaved = (const int16_t*)complexVector;
+  unsigned int point;
+
+  for(point = 0; point < num_points; ++point){
+    iBuffer[point] = *interleaved;
+    interleaved += 2; /* skip over the Q word */
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */
diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h
new file mode 100644
index 000000000..cd2fabb52
--- /dev/null
+++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h
@@ -0,0 +1,94 @@
+#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
+#define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  /*
+    SSSE3 implementation: takes the I word of every complex point and
+    reduces it to 8 bits by an arithmetic right shift of 8.
+    Sixteen points (four aligned 16-byte loads) are handled per iteration
+    and written with one aligned 16-byte store, so both buffers must be
+    16-byte aligned. Remaining points are handled by the scalar tail loop.
+  */
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+  /* the input is only ever read: keep the const qualifier on every view of it */
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  const int16_t* int16ComplexVectorPtr;
+  int8_t* iBufferPtr = iBuffer;
+  /* 0x80 control bytes make _mm_shuffle_epi8 zero the destination byte */
+  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+  for(number = 0; number < sixteenthPoints; number++){
+    complexVal1 = _mm_load_si128((const __m128i*)complexVectorPtr); complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((const __m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+    complexVal3 = _mm_load_si128((const __m128i*)complexVectorPtr); complexVectorPtr += 16;
+    complexVal4 = _mm_load_si128((const __m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+    /* gather the 8 I words of loads 1+2 into one register */
+    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+    complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+
+    /* gather the 8 I words of loads 3+4 into another */
+    complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+    complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+
+    complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+
+    /* keep the upper byte of each word (sign preserved by the arithmetic shift) */
+    complexVal1 = _mm_srai_epi16(complexVal1, 8);
+    complexVal3 = _mm_srai_epi16(complexVal3, 8);
+
+    /* pack the sixteen 16 bit values into sixteen 8 bit values */
+    iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    iBufferPtr += 16;
+  }
+
+  /* scalar tail: continue from where the vector loop stopped, viewing the input as 16 bit words */
+  number = sixteenthPoints * 16;
+  int16ComplexVectorPtr = (const int16_t*)complexVectorPtr;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+    int16ComplexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  /*
+    Scalar reference implementation: for each complex point, shifts the
+    16 bit I word right by 8 and stores the result as an 8 bit value.
+    The Q word is skipped.
+  */
+  const int16_t* interleaved = (const int16_t*)complexVector;
+  unsigned int point;
+
+  for(point = 0; point < num_points; ++point){
+    iBuffer[point] = ((int8_t)(*interleaved >> 8));
+    interleaved += 2; /* step over I and Q */
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+/* Defined outside this header; only the prototype is visible here (presumably supplied by the ORC backend - confirm). */
+extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+/* Thin inline wrapper: forwards every argument unchanged to the _orc_impl entry point. */
+static inline void volk_16ic_deinterleave_real_8i_a_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */
diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a.h b/volk/include/volk/volk_16ic_magnitude_16i_a.h
new file mode 100644
index 000000000..317075e85
--- /dev/null
+++ b/volk/include/volk/volk_16ic_magnitude_16i_a.h
@@ -0,0 +1,191 @@
+#ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
+#define INCLUDED_volk_16ic_magnitude_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  /*
+    SSE3 implementation. Four complex points are processed per iteration:
+    the eight 16 bit words are converted to float through an aligned
+    staging buffer, scaled by 1/32768, squared, pair-wise summed with a
+    horizontal add, square-rooted, scaled back by 32768 and converted to
+    int16_t. Remaining points are computed by the scalar tail loop.
+  */
+  const unsigned int quarterPoints = num_points / 4;
+  const int16_t* inPtr = (const int16_t*)complexVector;
+  int16_t* outPtr = magnitudeVector;
+  unsigned int number;
+  unsigned int k;
+
+  const __m128 vScalar = _mm_set_ps1(32768.0);
+  const __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+  __m128 lowPair, highPair, magnitude;
+
+  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+  for(number = 0; number < quarterPoints; number++){
+    /* int16 -> float conversion is done in scalar code via the staging buffer */
+    for(k = 0; k < 8; k++){
+      inputFloatBuffer[k] = (float)(inPtr[k]);
+    }
+    inPtr += 8;
+
+    lowPair = _mm_mul_ps(_mm_load_ps(&inputFloatBuffer[0]), invScalar);
+    highPair = _mm_mul_ps(_mm_load_ps(&inputFloatBuffer[4]), invScalar);
+
+    lowPair = _mm_mul_ps(lowPair, lowPair); // Square the values
+    highPair = _mm_mul_ps(highPair, highPair); // Square the values
+
+    magnitude = _mm_hadd_ps(lowPair, highPair); // I^2 + Q^2 for each of the 4 points
+    magnitude = _mm_sqrt_ps(magnitude);
+    magnitude = _mm_mul_ps(magnitude, vScalar); // Undo the input scaling
+
+    _mm_store_ps(outputFloatBuffer, magnitude);
+    for(k = 0; k < 4; k++){
+      *outPtr++ = (int16_t)(outputFloatBuffer[k]);
+    }
+  }
+
+  /* scalar tail for the num_points % 4 leftover points */
+  number = quarterPoints * 4;
+  outPtr = &magnitudeVector[number];
+  inPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    const float re = (float)(inPtr[0]) / 32768.0;
+    const float im = (float)(inPtr[1]) / 32768.0;
+    const float scaled = sqrtf((re * re) + (im * im)) * 32768.0;
+    inPtr += 2;
+    *outPtr++ = (int16_t)(scaled);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+ /*
+ SSE implementation: computes sqrt(I^2 + Q^2) for 4 complex points per
+ iteration. Inputs are scaled by 1/32768 before squaring and the result is
+ scaled back by 32768 before the conversion to int16_t. Points left over
+ after the last full group of 4 are handled by the scalar loop at the end.
+ */
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 vScalar = _mm_set_ps1(32768.0);
+ __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+ /* Aligned staging buffers: the int16 <-> float conversions are done in scalar code */
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+
+ /* First two complex points (4 words) -> cplxValue1 */
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ cplxValue1 = _mm_load_ps(inputFloatBuffer);
+ complexVectorPtr += 4;
+
+ /* Next two complex points -> cplxValue2 (the staging buffer is reused) */
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ cplxValue2 = _mm_load_ps(inputFloatBuffer);
+ complexVectorPtr += 4;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ result = _mm_mul_ps(result, vScalar); // Scale the results
+
+ /* NOTE(review): magnitudes above 32767 (both components near full scale) do not fit in int16_t - confirm callers never rely on those outputs */
+ _mm_store_ps(outputFloatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+ }
+
+ /* Scalar tail for the num_points % 4 leftover points */
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+ *magnitudeVectorPtr++ = (int16_t)(val1Result);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  /*
+    Scalar reference implementation: for each complex point, scales I and Q
+    by 1/32768, computes sqrt(I^2 + Q^2), scales the result back by 32768
+    and stores it converted to int16_t.
+  */
+  const float scalar = 32768.0;
+  const int16_t* interleaved = (const int16_t*)complexVector;
+  unsigned int point;
+
+  for(point = 0; point < num_points; ++point){
+    const float real = ((float)(interleaved[0])) / scalar;
+    const float imag = ((float)(interleaved[1])) / scalar;
+    interleaved += 2;
+    magnitudeVector[point] = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC_DISABLED
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+/* Defined outside this header; note that the implementation takes an extra float scalar argument that the public wrapper does not expose. */
+extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
+/* Thin inline wrapper: calls the _orc_impl entry point with the scalar fixed at 32768.0. */
+static inline void volk_16ic_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+ volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
+}
+#endif /* LV_HAVE_ORC_DISABLED */
+
+
+#endif /* INCLUDED_volk_16ic_magnitude_16i_a_H */
diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h
new file mode 100644
index 000000000..1300395ff
--- /dev/null
+++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h
@@ -0,0 +1,109 @@
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex data values to be deinterleaved
+ */
+static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  /*
+    SSE implementation. Four complex points are processed per iteration:
+    the eight 16 bit words are converted to float through an aligned
+    staging buffer, multiplied by 1/scalar and split into an I vector and
+    a Q vector that are written with aligned stores. The remaining points
+    are handled by the scalar tail loop, which divides by scalar directly.
+  */
+  float* iOut = iBuffer;
+  float* qOut = qBuffer;
+
+  const uint64_t quarterPoints = num_points / 4;
+  uint64_t number;
+  unsigned int k;
+
+  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
+  const int16_t* inPtr = (const int16_t*)complexVector;
+  __m128 lowPair, highPair, iValue, qValue;
+
+  __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
+
+  for(number = 0; number < quarterPoints; number++){
+    /* int16 -> float conversion is done in scalar code via the staging buffer */
+    for(k = 0; k < 8; k++){
+      floatBuffer[k] = (float)(inPtr[k]);
+    }
+    inPtr += 8;
+
+    lowPair = _mm_mul_ps(_mm_load_ps(&floatBuffer[0]), invScalar);
+    highPair = _mm_mul_ps(_mm_load_ps(&floatBuffer[4]), invScalar);
+
+    // Even lanes of both registers: i1i2i3i4
+    iValue = _mm_shuffle_ps(lowPair, highPair, _MM_SHUFFLE(2,0,2,0));
+    // Odd lanes of both registers: q1q2q3q4
+    qValue = _mm_shuffle_ps(lowPair, highPair, _MM_SHUFFLE(3,1,3,1));
+
+    _mm_store_ps(iOut, iValue);
+    _mm_store_ps(qOut, qValue);
+
+    iOut += 4;
+    qOut += 4;
+  }
+
+  /* scalar tail for the num_points % 4 leftover points */
+  number = quarterPoints * 4;
+  inPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *iOut++ = (float)(inPtr[0]) / scalar;
+    *qOut++ = (float)(inPtr[1]) / scalar;
+    inPtr += 2;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex data values to be deinterleaved
+ */
+static inline void volk_16ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  /*
+    Scalar reference implementation: converts each 16 bit I and Q word to
+    float, divides it by scalar and writes it to iBuffer / qBuffer.
+  */
+  const int16_t* interleaved = (const int16_t*)complexVector;
+  unsigned int point;
+
+  for(point = 0; point < num_points; ++point){
+    iBuffer[point] = (float)(interleaved[0]) / scalar;
+    qBuffer[point] = (float)(interleaved[1]) / scalar;
+    interleaved += 2; /* advance to the next I/Q pair */
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+ \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex data values to be deinterleaved
+ */
+/* Defined outside this header; only the prototype is visible here (presumably supplied by the ORC backend - confirm). */
+extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+/* Thin inline wrapper: forwards every argument unchanged to the _orc_impl entry point. */
+static inline void volk_16ic_s32f_deinterleave_32f_x2_a_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */
diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h
new file mode 100644
index 000000000..5e2d82b94
--- /dev/null
+++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h
@@ -0,0 +1,126 @@
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I float vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param scalar The value each data point is divided by (implemented as a multiplication by 1/scalar)
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  /*
+    SSE4.1 implementation: extracts the I word of every complex point,
+    converts it to float and multiplies it by 1/scalar.
+    Four points (one aligned 16-byte load) are handled per iteration and
+    written with an aligned store, so both buffers must be 16-byte aligned.
+    Remaining points are handled by the scalar tail loop.
+  */
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m128 iFloatValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128i complexVal, iIntVal;
+  /* the input is only ever read: keep the const qualifier on every view of it */
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  const int16_t* sixteenTComplexVectorPtr;
+
+  /* gathers the four I words into the low 8 bytes; 0x80 control bytes zero the rest */
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+
+  for(;number < quarterPoints; number++){
+    complexVal = _mm_load_si128((const __m128i*)complexVectorPtr); complexVectorPtr += 16;
+    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+    /* sign-extend the four 16 bit I words to 32 bit, then convert to float */
+    iIntVal = _mm_cvtepi16_epi32(complexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iFloatValue);
+
+    iBufferPtr += 4;
+  }
+
+  /* scalar tail: convert I, skip Q */
+  number = quarterPoints * 4;
+  sixteenTComplexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+    sixteenTComplexVectorPtr++;
+  }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I float vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param scalar The value each data point is divided by (implemented as a multiplication by 1/scalar)
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  /*
+    SSE implementation: converts the I word of every complex point to
+    float and multiplies it by 1/scalar. Four points are staged through an
+    aligned float buffer per iteration and written with an aligned store;
+    the remaining points are handled by the scalar tail loop.
+  */
+  const unsigned int quarterPoints = num_points / 4;
+  const float iScalar = 1.0/scalar;
+  const __m128 invScalar = _mm_set_ps1(iScalar);
+  const int16_t* inPtr = (const int16_t*)complexVector;
+  float* outPtr = iBuffer;
+  unsigned int number;
+  unsigned int k;
+
+  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+  for(number = 0; number < quarterPoints; number++){
+    /* every second word is an I value */
+    for(k = 0; k < 4; k++){
+      floatBuffer[k] = (float)(inPtr[2 * k]);
+    }
+    inPtr += 8;
+
+    _mm_store_ps(outPtr, _mm_mul_ps(_mm_load_ps(floatBuffer), invScalar));
+
+    outPtr += 4;
+  }
+
+  /* scalar tail: convert I, skip Q */
+  number = quarterPoints * 4;
+  inPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *outPtr++ = ((float)(*inPtr)) * iScalar;
+    inPtr += 2;
+  }
+
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I float vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param scalar The value each data point is divided by (implemented as a multiplication by 1/scalar)
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  /*
+    Scalar reference implementation: converts the I word of each complex
+    point to float and multiplies it by the precomputed 1/scalar.
+    The Q word is skipped.
+  */
+  const float invScalar = 1.0 / scalar;
+  const int16_t* interleaved = (const int16_t*)complexVector;
+  unsigned int point;
+
+  for(point = 0; point < num_points; ++point){
+    iBuffer[point] = ((float)(*interleaved)) * invScalar;
+    interleaved += 2; /* step over I and Q */
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H */
diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h
new file mode 100644
index 000000000..d20eea1a7
--- /dev/null
+++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h
@@ -0,0 +1,180 @@
+#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
+#define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector (after dividing each component by scalar) and stores the results in the magnitudeVector
+  \param magnitudeVector The vector receiving the real output values (written with aligned SSE stores)
+  \param complexVector The vector containing the complex input values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+
+  // A single reciprocal is shared by the vector loop and the scalar tail so
+  // that every output element is scaled the same way (the tail previously
+  // divided by scalar, which can round differently from multiplying by 1/scalar).
+  const float iScalar = 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+
+  __m128 cplxValue1, cplxValue2, result;
+
+  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+
+  for(;number < quarterPoints; number++){
+
+    // Convert four complex samples (eight int16 values) to float via an aligned staging buffer
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the values
+
+    // Horizontal add pairs up neighbouring lanes, i.e. I^2 + Q^2 of each sample
+    result = _mm_hadd_ps(cplxValue1, cplxValue2);
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    _mm_store_ps(magnitudeVectorPtr, result);
+
+    magnitudeVectorPtr += 4;
+  }
+
+  // Handle the (up to three) samples that did not fill a whole vector
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    float val1Real = (float)(*complexVectorPtr++) * iScalar;
+    float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector (after dividing each component by scalar) and stores the results in the magnitudeVector
+  \param magnitudeVector The vector receiving the real output values (written with aligned SSE stores)
+  \param complexVector The vector containing the complex input values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  const float iScalar = 1.0 / scalar;
+  const __m128 invScalar = _mm_set_ps1(iScalar);
+
+  const int16_t* src = (const int16_t*)complexVector;
+  float* dst = magnitudeVector;
+  unsigned int block;
+  unsigned int k;
+
+  __m128 lowPair, highPair, re, im, sumSquares;
+
+  __VOLK_ATTR_ALIGNED(16) float staging[8];
+
+  for(block = 0; block < quarterPoints; block++){
+    // Convert four complex samples (eight int16 values) to float
+    for(k = 0; k < 8; k++){
+      staging[k] = (float)(src[k]);
+    }
+    src += 8;
+
+    lowPair = _mm_load_ps(&staging[0]);
+    highPair = _mm_load_ps(&staging[4]);
+
+    // 0x88 gathers elements 0 and 2 of each operand (real parts),
+    // 0xdd gathers elements 1 and 3 (imaginary parts)
+    re = _mm_shuffle_ps(lowPair, highPair, 0x88);
+    im = _mm_shuffle_ps(lowPair, highPair, 0xdd);
+
+    re = _mm_mul_ps(re, invScalar);
+    im = _mm_mul_ps(im, invScalar);
+
+    // I^2 + Q^2, then the square root
+    sumSquares = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im));
+    _mm_store_ps(dst, _mm_sqrt_ps(sumSquares));
+
+    dst += 4;
+  }
+
+  // Scalar path for the samples that did not fill a whole vector
+  src = (const int16_t*)&complexVector[quarterPoints * 4];
+  for(k = quarterPoints * 4; k < num_points; k++){
+    const float tailReal = (float)(src[0]) * iScalar;
+    const float tailImag = (float)(src[1]) * iScalar;
+    magnitudeVector[k] = sqrtf((tailReal * tailReal) + (tailImag * tailImag));
+    src += 2;
+  }
+}
+
+
+
+
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Calculates the magnitude of the complexVector (after dividing each component by scalar) and stores the results in the magnitudeVector
+  \param magnitudeVector The vector receiving the real output values
+  \param complexVector The vector containing the complex input values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_s32f_magnitude_32f_a_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int16_t* iq = (const int16_t*)complexVector;
+  const float invScalar = 1.0 / scalar;
+  unsigned int n;
+
+  for(n = 0; n < num_points; n++){
+    // Each complex sample is read as two consecutive int16 values
+    const float re = ( (float) (iq[0])) * invScalar;
+    const float im = ( (float) (iq[1])) * invScalar;
+    magnitudeVector[n] = sqrtf((re*re) + (im*im));
+    iq += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC_DISABLED
+/*!
+  \brief Forwards the scaled magnitude calculation to the externally defined volk_16ic_s32f_magnitude_32f_a_orc_impl
+  \param magnitudeVector The vector receiving the real output values
+  \param complexVector The vector containing the complex input values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16ic_s32f_magnitude_32f_a_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  // All arguments are passed through unchanged
+  volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC_DISABLED */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a_H */
diff --git a/volk/include/volk/volk_16u_byteswap_a.h b/volk/include/volk/volk_16u_byteswap_a.h
new file mode 100644
index 000000000..fc3eb5fa7
--- /dev/null
+++ b/volk/include/volk/volk_16u_byteswap_a.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_16u_byteswap_a_H
+#define INCLUDED_volk_16u_byteswap_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an aligned vector of uint16_t's.
+  \param intsToSwap The vector of data to byte swap (accessed with aligned 128 bit loads and stores)
+  \param num_points The number of data points
+*/
+static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){
+  const unsigned int eighthPoints = num_points / 8;
+  __m128i* blockPtr = (__m128i*)intsToSwap;
+  unsigned int idx;
+
+  // Eight 16 bit values per iteration: swapping the bytes of a 16 bit word is
+  // (word << 8) | (word >> 8) within each 16 bit lane
+  for(idx = 0; idx < eighthPoints; idx++){
+    const __m128i words = _mm_load_si128(blockPtr);
+    const __m128i left = _mm_slli_epi16(words, 8);
+    const __m128i right = _mm_srli_epi16(words, 8);
+    _mm_store_si128(blockPtr, _mm_or_si128(left, right));
+    blockPtr++;
+  }
+
+  // Byteswap any remaining points one at a time
+  for(idx = eighthPoints*8; idx < num_points; idx++){
+    const uint16_t word = intsToSwap[idx];
+    intsToSwap[idx] = (((word >> 8) & 0xff) | ((word << 8) & 0xff00));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) a vector of uint16_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++){
+    const uint16_t word = intsToSwap[idx];
+    // Move the high byte down and the low byte up
+    intsToSwap[idx] = (((word >> 8) & 0xff) | ((word << 8) & 0xff00));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+  \brief Forwards the in-place byteswap of a vector of uint16_t's to the externally defined volk_16u_byteswap_a_orc_impl.
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
+static inline void volk_16u_byteswap_a_orc(uint16_t* intsToSwap, unsigned int num_points){
+  // Both arguments are passed through unchanged
+  volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16u_byteswap_a_H */
diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h
new file mode 100644
index 000000000..8ef627a62
--- /dev/null
+++ b/volk/include/volk/volk_16u_byteswap_u.h
@@ -0,0 +1,63 @@
+#ifndef INCLUDED_volk_16u_byteswap_u_H
+#define INCLUDED_volk_16u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an unaligned vector of uint16_t's.
+  \param intsToSwap The vector of data to byte swap (accessed with unaligned 128 bit loads and stores)
+  \param num_points The number of data points
+*/
+static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
+  const unsigned int eighthPoints = num_points / 8;
+  __m128i* blockPtr = (__m128i*)intsToSwap;
+  unsigned int idx;
+
+  // Eight 16 bit values per iteration: swapping the bytes of a 16 bit word is
+  // (word << 8) | (word >> 8) within each 16 bit lane
+  for(idx = 0; idx < eighthPoints; idx++){
+    const __m128i words = _mm_loadu_si128(blockPtr);
+    const __m128i left = _mm_slli_epi16(words, 8);
+    const __m128i right = _mm_srli_epi16(words, 8);
+    _mm_storeu_si128(blockPtr, _mm_or_si128(left, right));
+    blockPtr++;
+  }
+
+  // Byteswap any remaining points one at a time
+  for(idx = eighthPoints*8; idx < num_points; idx++){
+    const uint16_t word = intsToSwap[idx];
+    intsToSwap[idx] = (((word >> 8) & 0xff) | ((word << 8) & 0xff00));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) a vector of uint16_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++){
+    const uint16_t word = intsToSwap[idx];
+    // Move the high byte down and the low byte up
+    intsToSwap[idx] = (((word >> 8) & 0xff) | ((word << 8) & 0xff00));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16u_byteswap_u_H */
diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a.h b/volk/include/volk/volk_32f_accumulator_s32f_a.h
new file mode 100644
index 000000000..78364d0a0
--- /dev/null
+++ b/volk/include/volk/volk_32f_accumulator_s32f_a.h
@@ -0,0 +1,68 @@
+#ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
+#define INCLUDED_volk_32f_accumulator_s32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Sums all values of the input buffer
+  \param result Receives the accumulated sum
+  \param inputBuffer The buffer of data to be accumulated (read with aligned SSE loads)
+  \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  const float* src = inputBuffer;
+  unsigned int idx;
+  float total = 0;
+
+  __VOLK_ATTR_ALIGNED(16) float laneSums[4];
+  __m128 laneAccumulator = _mm_setzero_ps();
+
+  // Four independent running sums, one per vector lane
+  for(idx = 0; idx < quarterPoints; idx++){
+    laneAccumulator = _mm_add_ps(laneAccumulator, _mm_load_ps(src));
+    src += 4;
+  }
+
+  // Fold the four lane sums into a single scalar, lane 0 first
+  _mm_store_ps(laneSums, laneAccumulator);
+  total = laneSums[0];
+  total += laneSums[1];
+  total += laneSums[2];
+  total += laneSums[3];
+
+  // Add the values that did not fill a whole vector
+  for(idx = quarterPoints * 4; idx < num_points; idx++){
+    total += inputBuffer[idx];
+  }
+  *result = total;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Sums all values of the input buffer
+  \param result Receives the accumulated sum
+  \param inputBuffer The buffer of data to be accumulated
+  \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_32f_accumulator_s32f_a_generic(float* result, const float* inputBuffer, unsigned int num_points){
+  float total = 0;
+  unsigned int idx;
+
+  // Plain left-to-right summation
+  for(idx = 0; idx < num_points; idx++){
+    total += inputBuffer[idx];
+  }
+  *result = total;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_a.h b/volk/include/volk/volk_32f_convert_64f_a.h
new file mode 100644
index 000000000..2c469ac42
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_64f_a.h
@@ -0,0 +1,70 @@
+#ifndef INCLUDED_volk_32f_convert_64f_a_H
+#define INCLUDED_volk_32f_convert_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+  \brief Converts the float values into double values
+  \param outputVector The converted double vector values (written with aligned SSE2 stores)
+  \param inputVector The float vector values to be converted (read with aligned SSE loads)
+  \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  const float* src = (const float*)inputVector;
+  double* dst = outputVector;
+  unsigned int idx;
+  __m128 floats;
+
+  for(idx = 0; idx < quarterPoints; idx++){
+    floats = _mm_load_ps(src);
+    src += 4;
+
+    // The conversion only widens the two low floats, so do them first ...
+    _mm_store_pd(dst, _mm_cvtps_pd(floats));
+    dst += 2;
+
+    // ... then move the two high floats into the low half and widen those
+    floats = _mm_movehl_ps(floats, floats);
+    _mm_store_pd(dst, _mm_cvtps_pd(floats));
+    dst += 2;
+  }
+
+  // Convert the points that did not fill a whole vector
+  for(idx = quarterPoints * 4; idx < num_points; idx++){
+    outputVector[idx] = (double)(inputVector[idx]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the float values into double values
+  \param outputVector The converted double vector values
+  \param inputVector The float vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  // Element-wise widening conversion
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = ((double)(inputVector[idx]));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_a_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h
new file mode 100644
index 000000000..10d8a4f6c
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_64f_u.h
@@ -0,0 +1,70 @@
+#ifndef INCLUDED_volk_32f_convert_64f_u_H
+#define INCLUDED_volk_32f_convert_64f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+  \brief Converts the float values into double values
+  \param outputVector The converted double vector values (written with unaligned SSE2 stores)
+  \param inputVector The float vector values to be converted (read with unaligned SSE loads)
+  \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  const float* src = (const float*)inputVector;
+  double* dst = outputVector;
+  unsigned int idx;
+  __m128 floats;
+
+  for(idx = 0; idx < quarterPoints; idx++){
+    floats = _mm_loadu_ps(src);
+    src += 4;
+
+    // The conversion only widens the two low floats, so do them first ...
+    _mm_storeu_pd(dst, _mm_cvtps_pd(floats));
+    dst += 2;
+
+    // ... then move the two high floats into the low half and widen those
+    floats = _mm_movehl_ps(floats, floats);
+    _mm_storeu_pd(dst, _mm_cvtps_pd(floats));
+    dst += 2;
+  }
+
+  // Convert the points that did not fill a whole vector
+  for(idx = quarterPoints * 4; idx < num_points; idx++){
+    outputVector[idx] = (double)(inputVector[idx]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the float values into double values
+  \param outputVector The converted double vector values
+  \param inputVector The float vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_u_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  // Element-wise widening conversion
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = ((double)(inputVector[idx]));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_u_H */
diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/include/volk/volk_32f_index_max_16u_a.h
new file mode 100644
index 000000000..b9ca1dd3e
--- /dev/null
+++ b/volk/include/volk/volk_32f_index_max_16u_a.h
@@ -0,0 +1,149 @@
+#ifndef INCLUDED_volk_32f_index_max_16u_a_H
+#define INCLUDED_volk_32f_index_max_16u_a_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include<smmintrin.h>
+
+/*!
+  \brief Finds the index of the largest value in src0 (the first such index when the maximum occurs more than once)
+  \param target Receives the index of the maximum in target[0]; left untouched when num_points is 0
+  \param src0 The input values (read with aligned SSE loads)
+  \param num_points The number of values in src0
+*/
+static inline void volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) {
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* inputPtr = (float*)src0;
+
+    // Lane indexes start at -4..-1 so that the first increment yields 0..3.
+    // They are tracked as floats, so they are only exact below 2^24.
+    __m128 indexIncrementValues = _mm_set1_ps(4);
+    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+    float max = src0[0];
+    unsigned int index = 0;
+    __m128 maxValues = _mm_set1_ps(max);
+    __m128 maxValuesIndex = _mm_setzero_ps();
+    __m128 compareResults;
+    __m128 currentValues;
+
+    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+    for(;number < quarterPoints; number++){
+
+      currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+      // Replace a lane only when the new value is strictly greater. Equal
+      // values keep the earlier index and NaN samples never displace the
+      // running maximum, matching the scalar comparison "src0[i] > max".
+      compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+
+      maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+      maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
+    }
+
+    // Reduce the four per-lane maxima to a single maximum
+    _mm_store_ps(maxValuesBuffer, maxValues);
+    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+    for(number = 0; number < 4; number++){
+      const unsigned int laneIndex = (unsigned int)maxIndexesBuffer[number];
+      // Lane order is not index order, so break ties on the lower index
+      if((maxValuesBuffer[number] > max) || ((maxValuesBuffer[number] == max) && (laneIndex < index))){
+        index = laneIndex;
+        max = maxValuesBuffer[number];
+      }
+    }
+
+    // Check the points that did not fill a whole vector
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      if(src0[number] > max){
+        index = number;
+        max = src0[number];
+      }
+    }
+    target[0] = index;
+  }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE
+#include<xmmintrin.h>
+
+/*!
+  \brief Finds the index of the largest value in src0 (the first such index when the maximum occurs more than once)
+  \param target Receives the index of the maximum in target[0]; left untouched when num_points is 0
+  \param src0 The input values (read with aligned SSE loads)
+  \param num_points The number of values in src0
+*/
+static inline void volk_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) {
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* inputPtr = (float*)src0;
+
+    // Lane indexes start at -4..-1 so that the first increment yields 0..3.
+    // They are tracked as floats, so they are only exact below 2^24.
+    __m128 indexIncrementValues = _mm_set1_ps(4);
+    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+    float max = src0[0];
+    unsigned int index = 0;
+    __m128 maxValues = _mm_set1_ps(max);
+    __m128 maxValuesIndex = _mm_setzero_ps();
+    __m128 compareResults;
+    __m128 currentValues;
+
+    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+    for(;number < quarterPoints; number++){
+
+      currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+      // Replace a lane only when the new value is strictly greater. Equal
+      // values keep the earlier index and NaN samples never displace the
+      // running maximum, matching the scalar comparison "src0[i] > max".
+      compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+
+      // Select the new entry where the mask is set, the old entry elsewhere
+      maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes) , _mm_andnot_ps(compareResults, maxValuesIndex));
+
+      maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues) , _mm_andnot_ps(compareResults, maxValues));
+    }
+
+    // Reduce the four per-lane maxima to a single maximum
+    _mm_store_ps(maxValuesBuffer, maxValues);
+    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+    for(number = 0; number < 4; number++){
+      const unsigned int laneIndex = (unsigned int)maxIndexesBuffer[number];
+      // Lane order is not index order, so break ties on the lower index
+      if((maxValuesBuffer[number] > max) || ((maxValuesBuffer[number] == max) && (laneIndex < index))){
+        index = laneIndex;
+        max = maxValuesBuffer[number];
+      }
+    }
+
+    // Check the points that did not fill a whole vector
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      if(src0[number] > max){
+        index = number;
+        max = src0[number];
+      }
+    }
+    target[0] = index;
+  }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Finds the index of the largest value in src0 (the first such index when the maximum occurs more than once)
+  \param target Receives the index of the maximum in target[0]; left untouched when num_points is 0
+  \param src0 The input values
+  \param num_points The number of values in src0
+*/
+static inline void volk_32f_index_max_16u_a_generic(unsigned int* target, const float* src0, unsigned int num_points) {
+  float best;
+  unsigned int bestIndex = 0;
+  unsigned int pos;
+
+  // Nothing to search: leave the target as it is
+  if(num_points == 0){
+    return;
+  }
+
+  best = src0[0];
+  for(pos = 1; pos < num_points; ++pos) {
+    // Strict comparison keeps the earliest index among equal values
+    if(src0[pos] > best){
+      bestIndex = pos;
+      best = src0[pos];
+    }
+  }
+  target[0] = bestIndex;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h
new file mode 100644
index 000000000..43713f8b5
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
+#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Performs the FM-detect differentiation on the input vector and stores the results in the output vector.
+  \param outputVector The vector where the results will be stored (written with aligned SSE stores).
+  \param inputVector The input vector containing phase data (must be on the interval (-bound,bound] ); read with aligned SSE loads.
+  \param bound The interval that the input phase data is in, which is used to modulo the differentiation
+  \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample; updated to the last input sample on return.
+  \param num_points The number of real values in the input vector.
+*/
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  if (num_points < 1) {
+    return;
+  }
+  // num_points-1 keeps Fedora 7's gcc from crashing...
+  // num_points won't work. :(
+  const unsigned int quarterPoints = (num_points-1) / 4;
+  // The first (up to) four outputs are always produced by the scalar code below
+  const unsigned int headPoints = (4 < num_points) ? 4 : num_points;
+  unsigned int k;
+  unsigned int block;
+
+  const __m128 upperBound = _mm_set_ps1(bound);
+  const __m128 lowerBound = _mm_set_ps1(-bound);
+  const __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above.
+  const __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below.
+  __m128 previous;
+  __m128 current;
+  __m128 delta;
+  __m128 overAdjust;
+  __m128 underAdjust;
+
+  // The very first difference is taken against the saved sample
+  outputVector[0] = inputVector[0] - *saveValue;
+  if (outputVector[0] > bound) outputVector[0] -= 2*bound;
+  if (outputVector[0] < -bound) outputVector[0] += 2*bound;
+  for (k = 1; k < headPoints; k++) {
+    outputVector[k] = inputVector[k] - inputVector[k-1];
+    if (outputVector[k] > bound) outputVector[k] -= 2*bound;
+    if (outputVector[k] < -bound) outputVector[k] += 2*bound;
+  }
+
+  const float* inPtr = inputVector + headPoints;
+  float* outPtr = outputVector + headPoints;
+
+  // Block 0 was handled above, so the vector loop starts at block 1
+  for (block = 1; block < quarterPoints; block++) {
+    // Load the four current samples and the four samples one position earlier
+    previous = _mm_loadu_ps((float*) (inPtr-1));
+    current = _mm_load_ps(inPtr);
+    inPtr += 4;
+    delta = _mm_sub_ps(current, previous);
+    // Build a per-lane correction: -2*bound above the interval, +2*bound below it
+    overAdjust = _mm_and_ps(_mm_cmpgt_ps(delta, upperBound), posBoundAdjust);
+    underAdjust = _mm_and_ps(_mm_cmplt_ps(delta, lowerBound), negBoundAdjust);
+    // Make sure we're in the bounding interval:
+    delta = _mm_add_ps(delta, _mm_or_ps(underAdjust, overAdjust));
+    _mm_store_ps(outPtr, delta); // Store the results back into the output
+    outPtr += 4;
+  }
+
+  // Scalar path for whatever the vector loop did not cover
+  for (k = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); k < num_points; k++) {
+    outputVector[k] = inputVector[k] - inputVector[k-1];
+    if (outputVector[k] > bound) outputVector[k] -= 2*bound;
+    if (outputVector[k] < -bound) outputVector[k] += 2*bound;
+  }
+
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Performs the FM-detect differentiation on the input vector and stores the results in the output vector.
+  \param outputVector The vector where the results will be stored.
+  \param inputVector The input vector containing phase data (must be on the interval (-bound,bound] )
+  \param bound The interval that the input phase data is in, which is used to modulo the differentiation
+  \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample; updated to the last input sample on return.
+  \param num_points The number of real values in the input vector.
+*/
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  unsigned int k;
+
+  if (num_points < 1) {
+    return;
+  }
+
+  // The very first difference is taken against the saved sample
+  outputVector[0] = inputVector[0] - *saveValue;
+  if (outputVector[0] > bound) outputVector[0] -= 2*bound;
+  if (outputVector[0] < -bound) outputVector[0] += 2*bound;
+
+  // Every later difference is taken against the preceding input sample
+  for (k = 1; k < num_points; k++) {
+    outputVector[k] = inputVector[k] - inputVector[k-1];
+    if (outputVector[k] > bound) outputVector[k] -= 2*bound;
+    if (outputVector[k] < -bound) outputVector[k] += 2*bound;
+  }
+
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h
new file mode 100644
index 000000000..db61e359d
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h
@@ -0,0 +1,168 @@
+#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
+#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the spectral noise floor of an input power spectrum
+
+  Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor.
+
+  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+  \param realDataPoints The input power spectrum (read with aligned SSE loads)
+  \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+  \param num_points The number of data points in the input power spectrum vector
+*/
+static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  const float* binPtr = realDataPoints;
+  unsigned int idx;
+
+  __VOLK_ATTR_ALIGNED(16) float laneSums[4];
+
+  __m128 bins;
+  __m128 sumVector = _mm_setzero_ps();
+
+  // First pass: sum every bin to obtain the overall mean
+  for(idx = 0; idx < quarterPoints; idx++){
+    bins = _mm_load_ps(binPtr);
+    binPtr += 4;
+    sumVector = _mm_add_ps(sumVector, bins);
+  }
+
+  _mm_store_ps(laneSums, sumVector);
+
+  float sumMean = 0.0;
+  sumMean += laneSums[0];
+  sumMean += laneSums[1];
+  sumMean += laneSums[2];
+  sumMean += laneSums[3];
+
+  for(idx = quarterPoints * 4; idx < num_points; idx++){
+    sumMean += realDataPoints[idx];
+  }
+
+  // The exclusion value is added to the mean so that only bins which are
+  // significantly higher (and would, thus, affect the mean more) are thrown out
+  const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+  // Second pass: sum and count only the bins that do NOT exceed the threshold
+  const __m128 thresholdVector = _mm_set_ps1(meanAmplitude);
+  const __m128 onesVector = _mm_set_ps1(1.0);
+  __m128 countVector = _mm_setzero_ps();
+  __m128 keepMask;
+  sumVector = _mm_setzero_ps();
+  binPtr = realDataPoints;
+
+  for(idx = 0; idx < quarterPoints; idx++){
+    bins = _mm_load_ps(binPtr);
+    binPtr += 4;
+
+    // All-ones in every lane whose bin is at or below the threshold
+    keepMask = _mm_cmple_ps(bins, thresholdVector);
+
+    // Masked-out lanes contribute 0 to both the sum and the count
+    sumVector = _mm_add_ps(sumVector, _mm_and_ps(keepMask, bins));
+    countVector = _mm_add_ps(countVector, _mm_and_ps(keepMask, onesVector));
+  }
+
+  _mm_store_ps(laneSums, sumVector);
+
+  sumMean = 0.0;
+  sumMean += laneSums[0];
+  sumMean += laneSums[1];
+  sumMean += laneSums[2];
+  sumMean += laneSums[3];
+
+  __VOLK_ATTR_ALIGNED(16) float laneCounts[4];
+  _mm_store_ps(laneCounts, countVector);
+
+  float validBinCount = 0;
+  validBinCount += laneCounts[0];
+  validBinCount += laneCounts[1];
+  validBinCount += laneCounts[2];
+  validBinCount += laneCounts[3];
+
+  // Same filtering for the bins that did not fill a whole vector
+  for(idx = quarterPoints * 4; idx < num_points; idx++){
+    if(realDataPoints[idx] <= meanAmplitude){
+      sumMean += realDataPoints[idx];
+      validBinCount += 1.0;
+    }
+  }
+
+  // Fall back to the threshold itself when no bin was counted
+  *noiseFloorAmplitude = (validBinCount > 0.0) ? (sumMean / validBinCount) : meanAmplitude;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Calculates the spectral noise floor of an input power spectrum
+
+  Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor.
+
+  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+  \param realDataPoints The input power spectrum
+  \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+  \param num_points The number of data points in the input power spectrum vector
+*/
+static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+  float sumMean = 0.0;
+  unsigned int idx;
+
+  // First pass: sum every bin to obtain the overall mean
+  for(idx = 0; idx < num_points; idx++){
+    sumMean += realDataPoints[idx];
+  }
+
+  // The exclusion value is added to the mean so that only bins which are
+  // significantly higher (and would, thus, affect the mean more) are thrown out
+  const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
+
+  // Second pass: sum and count only the bins at or below the threshold
+  unsigned int keptBins = 0;
+  sumMean = 0.0;
+  for(idx = 0; idx < num_points; idx++){
+    if (realDataPoints[idx] <= meanAmplitude){
+      sumMean += realDataPoints[idx];
+      keptBins++;
+    }
+  }
+
+  // Fall back to the threshold itself when no bin was kept
+  if (keptBins == 0){
+    *noiseFloorAmplitude = meanAmplitude;
+  }
+  else{
+    *noiseFloorAmplitude = sumMean / ((float)keptBins);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
new file mode 100644
index 000000000..9df4946f2
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
@@ -0,0 +1,150 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
+#define INCLUDED_volk_32f_s32f_convert_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Scales each input float by the scalar, clips the product to [-32768, 32767] and converts it to a 16 bit integer
+    \param outputVector The 16 bit output data buffer (written with aligned 128 bit stores)
+    \param inputVector The floating point input data buffer (read with aligned 128 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  const float lowerLimit = -32768;
+  const float upperLimit = 32767;
+  const unsigned int blockCount = num_points / 8;
+
+  const __m128 scale = _mm_set_ps1(scalar);
+  const __m128 lowerLimits = _mm_set_ps1(lowerLimit);
+  const __m128 upperLimits = _mm_set_ps1(upperLimit);
+
+  const float* src = inputVector;
+  int16_t* dst = outputVector;
+  unsigned int i;
+
+  // Vector path: eight points per iteration, handled as two groups of four floats
+  for(i = 0; i < blockCount; i++){
+    __m128 lo = _mm_mul_ps(_mm_load_ps(src), scale);
+    __m128 hi = _mm_mul_ps(_mm_load_ps(src + 4), scale);
+    src += 8;
+
+    // Clip against the upper limit first, then the lower limit
+    lo = _mm_max_ps(_mm_min_ps(lo, upperLimits), lowerLimits);
+    hi = _mm_max_ps(_mm_min_ps(hi, upperLimits), lowerLimits);
+
+    // Convert both groups to 32 bit integers and narrow them into eight 16 bit values
+    _mm_store_si128((__m128i*)dst, _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)));
+    dst += 8;
+  }
+
+  // Scalar path for the points left over after the last full group of eight
+  for(i = blockCount * 8; i < num_points; i++){
+    float scaled = inputVector[i] * scalar;
+    if(scaled > upperLimit)
+      scaled = upperLimit;
+    else if(scaled < lowerLimit)
+      scaled = lowerLimit;
+    outputVector[i] = (int16_t)rintf(scaled);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Scales each input float by the scalar, clips the product to [-32768, 32767] and converts it to a 16 bit integer
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer (read with aligned 128 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  const float lowerLimit = -32768;
+  const float upperLimit = 32767;
+  const unsigned int blockCount = num_points / 4;
+
+  const __m128 scale = _mm_set_ps1(scalar);
+  const __m128 lowerLimits = _mm_set_ps1(lowerLimit);
+  const __m128 upperLimits = _mm_set_ps1(upperLimit);
+
+  // Staging area: plain SSE has no packed float-to-int16 conversion here, so the
+  // clipped floats are written back to memory and converted one by one
+  __VOLK_ATTR_ALIGNED(16) float clipped[4];
+
+  const float* src = inputVector;
+  int16_t* dst = outputVector;
+  unsigned int i, lane;
+
+  // Vector path: scale and clip four points per iteration
+  for(i = 0; i < blockCount; i++){
+    __m128 scaled = _mm_mul_ps(_mm_load_ps(src), scale);
+    src += 4;
+
+    // Clip against the upper limit first, then the lower limit
+    scaled = _mm_min_ps(scaled, upperLimits);
+    scaled = _mm_max_ps(scaled, lowerLimits);
+
+    _mm_store_ps(clipped, scaled);
+    for(lane = 0; lane < 4; lane++){
+      *dst++ = (int16_t)rintf(clipped[lane]);
+    }
+  }
+
+  // Scalar path for the points left over after the last full group of four
+  for(i = blockCount * 4; i < num_points; i++){
+    float scaled = inputVector[i] * scalar;
+    if(scaled > upperLimit)
+      scaled = upperLimit;
+    else if(scaled < lowerLimit)
+      scaled = lowerLimit;
+    outputVector[i] = (int16_t)rintf(scaled);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Scales each input float by the scalar, clips the product to [-32768, 32767] and converts it to a 16 bit integer
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  const float lowerLimit = -32768;
+  const float upperLimit = 32767;
+  unsigned int i;
+
+  for(i = 0; i < num_points; i++){
+    float scaled = inputVector[i] * scalar;
+
+    // Saturate instead of wrapping when the product leaves the int16_t range
+    if(scaled > upperLimit){
+      scaled = upperLimit;
+    }
+    else if(scaled < lowerLimit){
+      scaled = lowerLimit;
+    }
+
+    outputVector[i] = (int16_t)rintf(scaled);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
new file mode 100644
index 000000000..56e42c9bd
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
@@ -0,0 +1,152 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
+#define INCLUDED_volk_32f_s32f_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Scales each input float by the scalar, clips the product to [-32768, 32767] and converts it to a 16 bit integer
+    \param outputVector The 16 bit output data buffer (written with unaligned 128 bit stores)
+    \param inputVector The floating point input data buffer (read with unaligned 128 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  const float lowerLimit = -32768;
+  const float upperLimit = 32767;
+  const unsigned int blockCount = num_points / 8;
+
+  const __m128 scale = _mm_set_ps1(scalar);
+  const __m128 lowerLimits = _mm_set_ps1(lowerLimit);
+  const __m128 upperLimits = _mm_set_ps1(upperLimit);
+
+  const float* src = inputVector;
+  int16_t* dst = outputVector;
+  unsigned int i;
+
+  // Vector path: eight points per iteration, handled as two groups of four floats
+  for(i = 0; i < blockCount; i++){
+    __m128 lo = _mm_mul_ps(_mm_loadu_ps(src), scale);
+    __m128 hi = _mm_mul_ps(_mm_loadu_ps(src + 4), scale);
+    src += 8;
+
+    // Clip against the upper limit first, then the lower limit
+    lo = _mm_max_ps(_mm_min_ps(lo, upperLimits), lowerLimits);
+    hi = _mm_max_ps(_mm_min_ps(hi, upperLimits), lowerLimits);
+
+    // Convert both groups to 32 bit integers and narrow them into eight 16 bit values
+    _mm_storeu_si128((__m128i*)dst, _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)));
+    dst += 8;
+  }
+
+  // Scalar path for the points left over after the last full group of eight
+  for(i = blockCount * 8; i < num_points; i++){
+    float scaled = inputVector[i] * scalar;
+    if(scaled > upperLimit)
+      scaled = upperLimit;
+    else if(scaled < lowerLimit)
+      scaled = lowerLimit;
+    outputVector[i] = (int16_t)rintf(scaled);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Scales each input float by the scalar, clips the product to [-32768, 32767] and converts it to a 16 bit integer
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer (read with unaligned 128 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  const float lowerLimit = -32768;
+  const float upperLimit = 32767;
+  const unsigned int blockCount = num_points / 4;
+
+  const __m128 scale = _mm_set_ps1(scalar);
+  const __m128 lowerLimits = _mm_set_ps1(lowerLimit);
+  const __m128 upperLimits = _mm_set_ps1(upperLimit);
+
+  // Local, aligned staging area for the clipped floats before per-element conversion.
+  // NOTE(review): __VOLK_ATTR_ALIGNED is presumably provided by volk/volk_common.h,
+  // which this header does not include itself - confirm it is always included first.
+  __VOLK_ATTR_ALIGNED(16) float clipped[4];
+
+  const float* src = inputVector;
+  int16_t* dst = outputVector;
+  unsigned int i, lane;
+
+  // Vector path: scale and clip four points per iteration
+  for(i = 0; i < blockCount; i++){
+    __m128 scaled = _mm_mul_ps(_mm_loadu_ps(src), scale);
+    src += 4;
+
+    // Clip against the upper limit first, then the lower limit
+    scaled = _mm_min_ps(scaled, upperLimits);
+    scaled = _mm_max_ps(scaled, lowerLimits);
+
+    _mm_store_ps(clipped, scaled);
+    for(lane = 0; lane < 4; lane++){
+      *dst++ = (int16_t)rintf(clipped[lane]);
+    }
+  }
+
+  // Scalar path for the points left over after the last full group of four
+  for(i = blockCount * 4; i < num_points; i++){
+    float scaled = inputVector[i] * scalar;
+    if(scaled > upperLimit)
+      scaled = upperLimit;
+    else if(scaled < lowerLimit)
+      scaled = lowerLimit;
+    outputVector[i] = (int16_t)rintf(scaled);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Scales each input float by the scalar, clips the product to [-32768, 32767] and converts it to a 16 bit integer
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  const float lowerLimit = -32768;
+  const float upperLimit = 32767;
+  unsigned int i;
+
+  for(i = 0; i < num_points; i++){
+    float scaled = inputVector[i] * scalar;
+
+    // Saturate instead of wrapping when the product leaves the int16_t range
+    if(scaled > upperLimit){
+      scaled = upperLimit;
+    }
+    else if(scaled < lowerLimit){
+      scaled = lowerLimit;
+    }
+
+    outputVector[i] = (int16_t)rintf(scaled);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h
new file mode 100644
index 000000000..38e6b2e74
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h
@@ -0,0 +1,189 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
+#define INCLUDED_volk_32f_s32f_convert_32i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, clips the product to the int32_t range, then converts the result into a 32 bit integer value
+    \param outputVector The 32 bit output data buffer (written with aligned 256 bit stores)
+    \param inputVector The floating point input data buffer (read with aligned 256 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Products above the int32_t range saturate to 2147483520, the largest int32_t value a float can represent
+  */
+static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int eighthPoints = num_points / 8;
+
+  const float* inputVectorPtr = inputVector;
+  int32_t* outputVectorPtr = outputVector;
+
+  // 2147483647 is not representable as a float: it rounds up to 2^31, which is
+  // outside int32_t, so a value clipped to it converted to 0x80000000 (INT32_MIN).
+  // Clip to the largest float strictly below 2^31 instead. -2^31 is exact.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483520.0f;
+  float r;
+
+  __m256 vScalar = _mm256_set1_ps(scalar);
+  __m256 inputVal1;
+  __m256i intInputVal1;
+  __m256 vmin_val = _mm256_set1_ps(min_val);
+  __m256 vmax_val = _mm256_set1_ps(max_val);
+
+  for(;number < eighthPoints; number++){
+    inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+
+    // Scale and clip
+    inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    // Converts using the current rounding mode (round-to-nearest by default)
+    intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+
+    _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    // Use the rounding conversion instruction here too: a plain C cast truncates
+    // and made the trailing points disagree with the vectorised ones
+    outputVector[number] = (int32_t)_mm_cvtss_si32(_mm_set_ss(r));
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, clips the product to the int32_t range, then converts the result into a 32 bit integer value
+    \param outputVector The 32 bit output data buffer (written with aligned 128 bit stores)
+    \param inputVector The floating point input data buffer (read with aligned 128 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Products above the int32_t range saturate to 2147483520, the largest int32_t value a float can represent
+  */
+static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = inputVector;
+  int32_t* outputVectorPtr = outputVector;
+
+  // 2147483647 is not representable as a float: it rounds up to 2^31, which is
+  // outside int32_t, so a value clipped to it converted to 0x80000000 (INT32_MIN).
+  // Clip to the largest float strictly below 2^31 instead. -2^31 is exact.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483520.0f;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1;
+  __m128i intInputVal1;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    // Scale and clip
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    // Converts using the current rounding mode (round-to-nearest by default)
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    // Use the rounding conversion instruction here too: a plain C cast truncates
+    // and made the trailing points disagree with the vectorised ones
+    outputVector[number] = (int32_t)_mm_cvtss_si32(_mm_set_ss(r));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, clips the product to the int32_t range, then converts the result into a 32 bit integer value
+    \param outputVector The 32 bit output data buffer
+    \param inputVector The floating point input data buffer (read with aligned 128 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Products above the int32_t range saturate to 2147483520, the largest int32_t value a float can represent
+    \note Conversion is a C cast, i.e. the fractional part is discarded (truncation toward zero)
+  */
+static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = inputVector;
+  int32_t* outputVectorPtr = outputVector;
+
+  // 2147483647 is not representable as a float: it rounds up to 2^31, and casting
+  // 2^31 to int32_t is out of range (undefined behaviour in C). Clip to the
+  // largest float strictly below 2^31 instead. -2^31 is exact.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483520.0f;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  // Aligned staging area so the clipped floats can be converted one by one
+  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    // Scale and clip
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int32_t)(r);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, clips the product to the int32_t range, then converts the result into a 32 bit integer value
+    \param outputVector The 32 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Products above the int32_t range saturate to 2147483520, the largest int32_t value a float can represent
+    \note Conversion is a C cast, i.e. the fractional part is discarded (truncation toward zero)
+  */
+static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int32_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  // 2147483647 is not representable as a float: it rounds up to 2^31, and casting
+  // 2^31 to int32_t is out of range (undefined behaviour in C). Clip to the
+  // largest float strictly below 2^31 instead. -2^31 is exact.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483520.0f;
+  float r;
+
+  for(number = 0; number < num_points; number++){
+    r = *inputVectorPtr++ * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    *outputVectorPtr++ = (int32_t)(r);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h
new file mode 100644
index 000000000..ee15edb46
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h
@@ -0,0 +1,142 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
+#define INCLUDED_volk_32f_s32f_convert_32i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, clips the product to the int32_t range, then converts the result into a 32 bit integer value
+    \param outputVector The 32 bit output data buffer (written with unaligned 128 bit stores)
+    \param inputVector The floating point input data buffer (read with unaligned 128 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+    \note Products above the int32_t range saturate to 2147483520, the largest int32_t value a float can represent
+  */
+static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = inputVector;
+  int32_t* outputVectorPtr = outputVector;
+
+  // 2147483647 is not representable as a float: it rounds up to 2^31, which is
+  // outside int32_t, so a value clipped to it converted to 0x80000000 (INT32_MIN).
+  // Clip to the largest float strictly below 2^31 instead. -2^31 is exact.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483520.0f;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1;
+  __m128i intInputVal1;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    // Scale and clip
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    // Converts using the current rounding mode (round-to-nearest by default)
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    // Use the rounding conversion instruction here too: a plain C cast truncates
+    // and made the trailing points disagree with the vectorised ones
+    outputVector[number] = (int32_t)_mm_cvtss_si32(_mm_set_ss(r));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, clips the product to the int32_t range, then converts the result into a 32 bit integer value
+    \param outputVector The 32 bit output data buffer
+    \param inputVector The floating point input data buffer (read with unaligned 128 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+    \note Products above the int32_t range saturate to 2147483520, the largest int32_t value a float can represent
+    \note Conversion is a C cast, i.e. the fractional part is discarded (truncation toward zero)
+  */
+static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = inputVector;
+  int32_t* outputVectorPtr = outputVector;
+
+  // 2147483647 is not representable as a float: it rounds up to 2^31, and casting
+  // 2^31 to int32_t is out of range (undefined behaviour in C). Clip to the
+  // largest float strictly below 2^31 instead. -2^31 is exact.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483520.0f;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  // Local, aligned staging area so the clipped floats can be converted one by one.
+  // NOTE(review): __VOLK_ATTR_ALIGNED is presumably provided by volk/volk_common.h,
+  // which this header does not include itself - confirm it is always included first.
+  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    // Scale and clip
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int32_t)(r);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, clips the product to the int32_t range, then converts the result into a 32 bit integer value
+    \param outputVector The 32 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+    \note Products above the int32_t range saturate to 2147483520, the largest int32_t value a float can represent
+    \note Conversion is a C cast, i.e. the fractional part is discarded (truncation toward zero)
+  */
+static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int32_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  // 2147483647 is not representable as a float: it rounds up to 2^31, and casting
+  // 2^31 to int32_t is out of range (undefined behaviour in C). Clip to the
+  // largest float strictly below 2^31 instead. -2^31 is exact.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483520.0f;
+  float r;
+
+  for(number = 0; number < num_points; number++){
+    r = *inputVectorPtr++ * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    *outputVectorPtr++ = (int32_t)(r);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h
new file mode 100644
index 000000000..800017d5d
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h
@@ -0,0 +1,155 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
+#define INCLUDED_volk_32f_s32f_convert_8i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, clips the product to [-128, 127], then converts the result into a 8 bit integer value
+    \param outputVector The 8 bit output data buffer (written with aligned 128 bit stores)
+    \param inputVector The floating point input data buffer (read with aligned 128 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  const float* inputVectorPtr = inputVector;
+  int8_t* outputVectorPtr = outputVector;
+
+  const float min_val = -128;
+  const float max_val = 127;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  // Vector path: sixteen points per iteration, as four groups of four floats
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    // Scale and clip
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+    inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+    inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+    // Converts using the current rounding mode (round-to-nearest by default)
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
+    intInputVal2 = _mm_cvtps_epi32(inputVal2);
+    intInputVal3 = _mm_cvtps_epi32(inputVal3);
+    intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+    // Narrow 32 bit -> 16 bit -> 8 bit with signed saturation
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    // Use the rounding conversion instruction here too: a plain C cast truncates
+    // and made the trailing points disagree with the vectorised ones
+    outputVector[number] = (int8_t)_mm_cvtss_si32(_mm_set_ss(r));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Scales each input float by the scalar, clips the product to [-128, 127] and converts it to an 8 bit integer
+    \param outputVector The 8 bit output data buffer
+    \param inputVector The floating point input data buffer (read with aligned 128 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Conversion is a C cast, i.e. the fractional part is discarded (truncation toward zero)
+  */
+static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  const float lowerLimit = -128;
+  const float upperLimit = 127;
+  const unsigned int blockCount = num_points / 4;
+
+  const __m128 scale = _mm_set_ps1(scalar);
+  const __m128 lowerLimits = _mm_set_ps1(lowerLimit);
+  const __m128 upperLimits = _mm_set_ps1(upperLimit);
+
+  // Aligned staging area so the clipped floats can be converted one by one
+  __VOLK_ATTR_ALIGNED(16) float clipped[4];
+
+  const float* src = inputVector;
+  int8_t* dst = outputVector;
+  unsigned int i, lane;
+
+  // Vector path: scale and clip four points per iteration
+  for(i = 0; i < blockCount; i++){
+    __m128 scaled = _mm_mul_ps(_mm_load_ps(src), scale);
+    src += 4;
+
+    // Clip against the upper limit first, then the lower limit
+    scaled = _mm_min_ps(scaled, upperLimits);
+    scaled = _mm_max_ps(scaled, lowerLimits);
+
+    _mm_store_ps(clipped, scaled);
+    for(lane = 0; lane < 4; lane++){
+      *dst++ = (int8_t)(clipped[lane]);
+    }
+  }
+
+  // Scalar path for the points left over after the last full group of four
+  for(i = blockCount * 4; i < num_points; i++){
+    float scaled = inputVector[i] * scalar;
+    if(scaled > upperLimit)
+      scaled = upperLimit;
+    else if(scaled < lowerLimit)
+      scaled = lowerLimit;
+    outputVector[i] = (int8_t)(scaled);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Scales each input float by the scalar, clips the product to [-128, 127] and converts it to an 8 bit integer
+    \param outputVector The 8 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Conversion is a C cast, i.e. the fractional part is discarded (truncation toward zero)
+  */
+static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  const float lowerLimit = -128;
+  const float upperLimit = 127;
+  unsigned int i;
+
+  for(i = 0; i < num_points; i++){
+    float scaled = inputVector[i] * scalar;
+
+    // Saturate instead of wrapping when the product leaves the int8_t range
+    if(scaled > upperLimit){
+      scaled = upperLimit;
+    }
+    else if(scaled < lowerLimit){
+      scaled = lowerLimit;
+    }
+
+    outputVector[i] = (int8_t)(scaled);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h
new file mode 100644
index 000000000..870e9419b
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h
@@ -0,0 +1,157 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
+#define INCLUDED_volk_32f_s32f_convert_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, clips the product to [-128, 127], then converts the result into a 8 bit integer value
+    \param outputVector The 8 bit output data buffer (written with unaligned 128 bit stores)
+    \param inputVector The floating point input data buffer (read with unaligned 128 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  const float* inputVectorPtr = inputVector;
+  int8_t* outputVectorPtr = outputVector;
+
+  const float min_val = -128;
+  const float max_val = 127;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  // Vector path: sixteen points per iteration, as four groups of four floats
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    // Scale and clip
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+    inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+    inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+    // Converts using the current rounding mode (round-to-nearest by default)
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
+    intInputVal2 = _mm_cvtps_epi32(inputVal2);
+    intInputVal3 = _mm_cvtps_epi32(inputVal3);
+    intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+    // Narrow 32 bit -> 16 bit -> 8 bit with signed saturation
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    // The output element is int8_t (the cast used to name int16_t). Use the
+    // rounding conversion instruction so the trailing points agree with the
+    // vectorised ones instead of being truncated.
+    outputVector[number] = (int8_t)_mm_cvtss_si32(_mm_set_ss(r));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, clips the product to [-128, 127], then converts the result into a 8 bit integer value
+    \param outputVector The 8 bit output data buffer
+    \param inputVector The floating point input data buffer (read with unaligned 128 bit loads)
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+    \note Conversion is a C cast, i.e. the fractional part is discarded (truncation toward zero)
+  */
+static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = inputVector;
+  int8_t* outputVectorPtr = outputVector;
+
+  const float min_val = -128;
+  const float max_val = 127;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  // Local, aligned staging area so the clipped floats can be converted one by one.
+  // NOTE(review): __VOLK_ATTR_ALIGNED is presumably provided by volk/volk_common.h,
+  // which this header does not include itself - confirm it is always included first.
+  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    // Scale and clip
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    // Cast to the actual element type of outputVector (was int16_t)
+    outputVector[number] = (int8_t)(r);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, clips the product to [-128, 127], then converts the result into a 8 bit integer value
+    \param outputVector The 8 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+    \note Conversion is a C cast, i.e. the fractional part is discarded (truncation toward zero)
+  */
+static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float min_val = -128;
+  const float max_val = 127;
+  float r;
+
+  for(number = 0; number < num_points; number++){
+    r = *inputVectorPtr++ * scalar;
+    // Saturate instead of wrapping when the product leaves the int8_t range
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    // Cast to the actual element type of outputVector (was int16_t)
+    *outputVectorPtr++ = (int8_t)(r);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h
new file mode 100644
index 000000000..99b8e68c5
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h
@@ -0,0 +1,119 @@
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplies every value of a float vector by a scalar (SSE, aligned loads and stores)
+  \param cVector Output buffer receiving aVector[i] * scalar
+  \param aVector Input buffer of values to be scaled
+  \param scalar The factor applied to every input value
+  \param num_points The number of values read from aVector and written to cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  const unsigned int vectorCount = num_points / 4;
+  const __m128 factor = _mm_set_ps1(scalar);
+  const float* src = aVector;
+  float* dst = cVector;
+  unsigned int i;
+
+  // Four floats per iteration
+  for(i = 0; i < vectorCount; i++){
+    _mm_store_ps(dst, _mm_mul_ps(_mm_load_ps(src), factor));
+    src += 4;
+    dst += 4;
+  }
+
+  // Remaining (num_points % 4) values are scaled one at a time
+  for(i = vectorCount * 4; i < num_points; i++){
+    *dst++ = (*src++) * scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Multiplies every value of a float vector by a scalar (AVX, aligned loads and stores)
+  \param cVector Output buffer receiving aVector[i] * scalar
+  \param aVector Input buffer of values to be scaled
+  \param scalar The factor applied to every input value
+  \param num_points The number of values read from aVector and written to cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  const unsigned int vectorizedPoints = (num_points / 8) * 8;
+  const __m256 factor = _mm256_set1_ps(scalar);
+  unsigned int idx;
+
+  // Eight floats per iteration
+  for(idx = 0; idx < vectorizedPoints; idx += 8){
+    const __m256 product = _mm256_mul_ps(_mm256_load_ps(aVector + idx), factor);
+    _mm256_store_ps(cVector + idx, product);
+  }
+
+  // Remaining (num_points % 8) values are scaled one at a time
+  for(idx = vectorizedPoints; idx < num_points; idx++){
+    cVector[idx] = aVector[idx] * scalar;
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies every value of a float vector by a scalar (portable C)
+  \param cVector Output buffer receiving aVector[i] * scalar
+  \param aVector Input buffer of values to be scaled
+  \param scalar The factor applied to every input value
+  \param num_points The number of values read from aVector and written to cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  unsigned int i;
+
+  for(i = 0; i < num_points; i++){
+    cVector[i] = aVector[i] * scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Scalar float multiply (ORC variant); forwards all four arguments unchanged to volk_32f_s32f_multiply_32f_a_orc_impl
+ \param cVector The vector where the results will be stored (passed as dst)
+ \param aVector The vector to be multiplied by the scalar (passed as src)
+ \param scalar the scalar value
+ \param num_points Passed through to the implementation; presumably the number of values in aVector to scale and store into cVector - confirm against the ORC source
+*/
+extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, const float scalar, unsigned int num_points);
+static inline void volk_32f_s32f_multiply_32f_a_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+ volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
new file mode 100644
index 000000000..b3fae9b05
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
@@ -0,0 +1,102 @@
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplies every value of a float vector by a scalar (SSE, unaligned loads and stores)
+  \param cVector Output buffer receiving aVector[i] * scalar
+  \param aVector Input buffer of values to be scaled
+  \param scalar The factor applied to every input value
+  \param num_points The number of values read from aVector and written to cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  const unsigned int vectorizedPoints = (num_points / 4) * 4;
+  const __m128 factor = _mm_set_ps1(scalar);
+  unsigned int idx = 0;
+
+  // Four floats per iteration
+  while(idx < vectorizedPoints){
+    _mm_storeu_ps(cVector + idx, _mm_mul_ps(_mm_loadu_ps(aVector + idx), factor));
+    idx += 4;
+  }
+
+  // Remaining (num_points % 4) values are scaled one at a time
+  while(idx < num_points){
+    cVector[idx] = aVector[idx] * scalar;
+    idx++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Multiplies every value of a float vector by a scalar (AVX, unaligned loads and stores)
+  \param cVector Output buffer receiving aVector[i] * scalar
+  \param aVector Input buffer of values to be scaled
+  \param scalar The factor applied to every input value
+  \param num_points The number of values read from aVector and written to cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  const unsigned int blockCount = num_points / 8;
+  const __m256 factor = _mm256_set1_ps(scalar);
+  const float* src = aVector;
+  float* dst = cVector;
+  unsigned int i;
+
+  // Eight floats per iteration
+  for(i = 0; i < blockCount; i++){
+    _mm256_storeu_ps(dst, _mm256_mul_ps(_mm256_loadu_ps(src), factor));
+    src += 8;
+    dst += 8;
+  }
+
+  // Remaining (num_points % 8) values are scaled one at a time
+  for(i = blockCount * 8; i < num_points; i++){
+    *dst++ = (*src++) * scalar;
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies every value of a float vector by a scalar (portable C)
+  \param cVector Output buffer receiving aVector[i] * scalar
+  \param aVector Input buffer of values to be scaled
+  \param scalar The factor applied to every input value
+  \param num_points The number of values read from aVector and written to cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  const float* src = aVector;
+  float* dst = cVector;
+  unsigned int remaining = num_points;
+
+  while(remaining > 0){
+    *dst++ = (*src++) * scalar;
+    remaining--;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_normalize_a.h b/volk/include/volk/volk_32f_s32f_normalize_a.h
new file mode 100644
index 000000000..f5fd0d1db
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_normalize_a.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_s32f_normalize_a_H
+#define INCLUDED_volk_32f_s32f_normalize_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Normalizes the buffer in place: every value is multiplied by (1.0 / scalar)
+  \param vecBuffer The buffer of values to be normalized; results overwrite the inputs
+  \param scalar The value each data point is divided by
+  \param num_points The number of values in vecBuffer
+*/
+static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float scalar, unsigned int num_points){
+  const float reciprocal = 1.0 / scalar;
+  const __m128 vReciprocal = _mm_set_ps1(reciprocal);
+  const uint64_t vectorCount = num_points / 4;
+  float* cursor = vecBuffer;
+  unsigned int i;
+
+  // Four floats per iteration, aligned load and store on the same address
+  for(i = 0; i < vectorCount; i++){
+    _mm_store_ps(cursor, _mm_mul_ps(_mm_load_ps(cursor), vReciprocal));
+    cursor += 4;
+  }
+
+  // Remaining (num_points % 4) values are scaled one at a time
+  for(i = vectorCount * 4; i < num_points; i++){
+    *cursor *= reciprocal;
+    cursor++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Normalizes the buffer in place: every value is multiplied by (1.0 / scalar)
+  \param vecBuffer The buffer of values to be normalized; results overwrite the inputs
+  \param scalar The value each data point is divided by
+  \param num_points The number of values in vecBuffer
+*/
+static inline void volk_32f_s32f_normalize_a_generic(float* vecBuffer, const float scalar, unsigned int num_points){
+  const float reciprocal = 1.0 / scalar;
+  unsigned int i;
+
+  for(i = 0; i < num_points; i++){
+    vecBuffer[i] *= reciprocal;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Normalizes the buffer by the scalar value (ORC variant): computes 1.0 / scalar and hands it to volk_32f_s32f_normalize_a_orc_impl
+ \param vecBuffer The buffer to normalize; it is passed to the implementation as both dst and src
+ \param scalar The value whose reciprocal is passed to the implementation
+ \param num_points Passed through to the implementation; presumably the number of values in vecBuffer - confirm against the ORC source
+*/
+extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points);
+static inline void volk_32f_s32f_normalize_a_orc(float* vecBuffer, const float scalar, unsigned int num_points){
+ float invscalar = 1.0 / scalar;
+ volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_normalize_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/include/volk/volk_32f_s32f_power_32f_a.h
new file mode 100644
index 000000000..633ad14b0
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_power_32f_a.h
@@ -0,0 +1,144 @@
+#ifndef INCLUDED_volk_32f_s32f_power_32f_a_H
+#define INCLUDED_volk_32f_s32f_power_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Takes each input vector value to the specified power and stores the results in the return vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of values to be taken to a power
+  \param power The power value to be applied to each data point
+  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+  \note The vectorized path is only compiled when LV_HAVE_LIB_SIMDMATH is defined; otherwise every value goes through powf
+*/
+static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){
+  unsigned int number = 0;
+
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+  const unsigned int quarterPoints = num_points / 4;
+  __m128 vPower = _mm_set_ps1(power);
+  __m128 zeroValue = _mm_setzero_ps();
+  __m128 signMask;
+  __m128 negatedValues;
+  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+  __m128 onesMask = _mm_set_ps1(1);
+
+  __m128 aVal, cVal;
+  for(;number < quarterPoints; number++){
+
+    aVal = _mm_load_ps(aPtr);
+    signMask = _mm_cmplt_ps(aVal, zeroValue);       // all-ones in lanes where the base is negative
+    negatedValues = _mm_sub_ps(zeroValue, aVal);
+    aVal = _mm_blendv_ps(aVal, negatedValues, signMask); // replace negative bases by their negation
+
+    // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
+    cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+    // Lanes that held a negative base are multiplied by (-1)^power, all others by 1
+    cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
+
+    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  // Scalar path: the leftover points, or all points when simdmath is unavailable
+  for(;number < num_points; number++){
+    *cPtr++ = powf((*aPtr++), power);
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Raises every input value to the given power and writes the results to the output vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of base values
+  \param power The exponent applied to every base value
+  \param num_points The number of values read from aVector and written to cVector
+  \note The vectorized path is only compiled when LV_HAVE_LIB_SIMDMATH is defined; otherwise every value goes through powf
+*/
+static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){
+  const float* src = aVector;
+  float* dst = cVector;
+  unsigned int i = 0;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+  const unsigned int vectorCount = num_points / 4;
+  const __m128 exponent = _mm_set_ps1(power);
+  const __m128 zeros = _mm_setzero_ps();
+  const __m128 minusOneToPower = _mm_set_ps1(powf(-1, power));
+  const __m128 ones = _mm_set_ps1(1);
+  __m128 base, isNegative, magnitude, signFactor, result;
+
+  for(; i < vectorCount; i++){
+    base = _mm_load_ps(src);
+    isNegative = _mm_cmplt_ps(base, zeros);
+
+    // powf4 doesn't support negative bases: negative lanes are negated first and (-1)^power is applied afterwards
+    magnitude = _mm_or_ps(_mm_andnot_ps(isNegative, base), _mm_and_ps(isNegative, _mm_sub_ps(zeros, base)));
+    result = powf4(magnitude, exponent);
+
+    // and/andnot/or emulate a per-lane select: (-1)^power for negative bases, 1 otherwise
+    signFactor = _mm_or_ps(_mm_andnot_ps(isNegative, ones), _mm_and_ps(isNegative, minusOneToPower));
+    _mm_store_ps(dst, _mm_mul_ps(signFactor, result));
+
+    src += 4;
+    dst += 4;
+  }
+
+  i = vectorCount * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  // Scalar path: the leftover points, or all points when simdmath is unavailable
+  for(; i < num_points; i++){
+    *dst++ = powf((*src++), power);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Raises every input value to the given power with powf and writes the results to the output vector
+   \param cVector The vector where the results will be stored
+   \param aVector The vector of base values
+   \param power The exponent applied to every base value
+   \param num_points The number of values read from aVector and written to cVector
+ */
+static inline void volk_32f_s32f_power_32f_a_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){
+  unsigned int i;
+
+  for(i = 0; i < num_points; i++){
+    cVector[i] = powf(aVector[i], power);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h
new file mode 100644
index 000000000..98401b2d4
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h
@@ -0,0 +1,145 @@
+#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
+#define INCLUDED_volk_32f_s32f_stddev_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Calculates the standard deviation of the input buffer from a caller supplied mean, as sqrt(sum(x^2) / num_points - mean^2)
+  \param stddev Receives the calculated standard deviation (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param mean The mean of the input buffer
+  \param num_points The number of values in the input buffer used in the calculation
+*/
+static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+  float result = 0;
+  if(num_points > 0){
+    const unsigned int blockCount = num_points / 16;
+    const float* src = inputBuffer;
+    unsigned int i;
+    unsigned int lane;
+
+    __VOLK_ATTR_ALIGNED(16) float lanes[4];
+
+    __m128 sumOfSquares = _mm_setzero_ps();
+    __m128 in0, in1, in2, in3;
+    __m128 dot0, dot1, dot2, dot3;
+
+    // Sixteen floats per iteration
+    for(i = 0; i < blockCount; i++){
+      in0 = _mm_load_ps(src);
+      in1 = _mm_load_ps(src + 4);
+      in2 = _mm_load_ps(src + 8);
+      in3 = _mm_load_ps(src + 12);
+      src += 16;
+
+      // Each dot product is written to a different lane (low mask nibble 1, 2, 4, 8), the other lanes are zeroed
+      dot0 = _mm_dp_ps(in0, in0, 0xF1);
+      dot1 = _mm_dp_ps(in1, in1, 0xF2);
+      dot2 = _mm_dp_ps(in2, in2, 0xF4);
+      dot3 = _mm_dp_ps(in3, in3, 0xF8);
+
+      // OR merges the four single-lane results into one vector
+      dot0 = _mm_or_ps(dot0, dot1);
+      dot2 = _mm_or_ps(dot2, dot3);
+      sumOfSquares = _mm_add_ps(sumOfSquares, _mm_or_ps(dot0, dot2));
+    }
+
+    // Horizontal sum of the four partial sums
+    _mm_store_ps(lanes, sumOfSquares);
+    result = lanes[0];
+    for(lane = 1; lane < 4; lane++){
+      result += lanes[lane];
+    }
+
+    // Remaining (num_points % 16) values
+    for(i = blockCount * 16; i < num_points; i++){
+      result += (*src) * (*src);
+      src++;
+    }
+
+    result /= num_points;
+    result -= (mean * mean);
+    result = sqrtf(result);
+  }
+  *stddev = result;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the standard deviation of the input buffer from a caller supplied mean, as sqrt(sum(x^2) / num_points - mean^2)
+  \param stddev Receives the calculated standard deviation (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param mean The mean of the input buffer
+  \param num_points The number of values in the input buffer used in the calculation
+*/
+static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+  float result = 0;
+  if(num_points > 0){
+    const unsigned int vectorCount = num_points / 4;
+    const float* src = inputBuffer;
+    unsigned int i;
+    unsigned int lane;
+
+    __VOLK_ATTR_ALIGNED(16) float lanes[4];
+
+    __m128 sumOfSquares = _mm_setzero_ps();
+    __m128 x;
+
+    // Four floats per iteration: sumOfSquares += x^2
+    for(i = 0; i < vectorCount; i++){
+      x = _mm_load_ps(src);
+      sumOfSquares = _mm_add_ps(sumOfSquares, _mm_mul_ps(x, x));
+      src += 4;
+    }
+
+    // Horizontal sum of the four partial sums
+    _mm_store_ps(lanes, sumOfSquares);
+    result = lanes[0];
+    for(lane = 1; lane < 4; lane++){
+      result += lanes[lane];
+    }
+
+    // Remaining (num_points % 4) values
+    for(i = vectorCount * 4; i < num_points; i++){
+      result += (*src) * (*src);
+      src++;
+    }
+
+    result /= num_points;
+    result -= (mean * mean);
+    result = sqrtf(result);
+  }
+  *stddev = result;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Calculates the standard deviation of the input buffer from a caller supplied mean, as sqrt(sum(x^2) / num_points - mean^2)
+  \param stddev Receives the calculated standard deviation (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param mean The mean of the input buffer
+  \param num_points The number of values in the input buffer used in the calculation
+*/
+static inline void volk_32f_s32f_stddev_32f_a_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+  float sumOfSquares = 0;
+  unsigned int i;
+
+  // An empty buffer reports a deviation of zero
+  if(num_points == 0){
+    *stddev = sumOfSquares;
+    return;
+  }
+
+  for(i = 0; i < num_points; i++){
+    sumOfSquares += inputBuffer[i] * inputBuffer[i];
+  }
+
+  sumOfSquares /= num_points;
+  sumOfSquares -= (mean * mean);
+  *stddev = sqrtf(sumOfSquares);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */
diff --git a/volk/include/volk/volk_32f_sqrt_32f_a.h b/volk/include/volk/volk_32f_sqrt_32f_a.h
new file mode 100644
index 000000000..d9b16fc0f
--- /dev/null
+++ b/volk/include/volk/volk_32f_sqrt_32f_a.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_32f_sqrt_32f_a_H
+#define INCLUDED_volk_32f_sqrt_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes the square root of every input value and stores the results in the output vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of values whose square roots are taken
+  \param num_points The number of values read from aVector and written to cVector
+*/
+static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points){
+  const unsigned int vectorizedPoints = (num_points / 4) * 4;
+  unsigned int idx;
+
+  // Four floats per iteration using aligned loads and stores
+  for(idx = 0; idx < vectorizedPoints; idx += 4){
+    _mm_store_ps(cVector + idx, _mm_sqrt_ps(_mm_load_ps(aVector + idx)));
+  }
+
+  // Remaining (num_points % 4) values go through sqrtf
+  for(idx = vectorizedPoints; idx < num_points; idx++){
+    cVector[idx] = sqrtf(aVector[idx]);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes the square root of every input value with sqrtf and stores the results in the output vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of values whose square roots are taken
+  \param num_points The number of values read from aVector and written to cVector
+*/
+static inline void volk_32f_sqrt_32f_a_generic(float* cVector, const float* aVector, unsigned int num_points){
+  unsigned int i;
+
+  for(i = 0; i < num_points; i++){
+    cVector[i] = sqrtf(aVector[i]);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+extern void volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int);
+/*!
+ \brief Square root of the input vector (ORC variant); forwards all three arguments unchanged to volk_32f_sqrt_32f_a_orc_impl
+ \param cVector The vector where the results will be stored
+ \param aVector The vector of values to be sqrted
+ \param num_points Passed through to the implementation; presumably the number of values in aVector to be sqrted and stored into cVector - confirm against the ORC source
+*/
+static inline void volk_32f_sqrt_32f_a_orc(float* cVector, const float* aVector, unsigned int num_points){
+ volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h
new file mode 100644
index 000000000..7de32f7b1
--- /dev/null
+++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h
@@ -0,0 +1,170 @@
+#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
+#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Calculates the mean of the input buffer and its standard deviation as sqrt(sum(x^2) / num_points - mean^2)
+  \param stddev Receives the calculated standard deviation (0 when num_points is 0)
+  \param mean Receives the calculated mean (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation and mean for
+  \param num_points The number of values in the input buffer used in the calculations
+*/
+static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float deviation = 0;
+  float average = 0;
+  if(num_points > 0){
+    const unsigned int blockCount = num_points / 16;
+    const float* src = inputBuffer;
+    unsigned int i;
+    unsigned int lane;
+
+    __VOLK_ATTR_ALIGNED(16) float sumLanes[4];
+    __VOLK_ATTR_ALIGNED(16) float squareLanes[4];
+
+    __m128 sum = _mm_setzero_ps();
+    __m128 sumOfSquares = _mm_setzero_ps();
+    __m128 in0, in1, in2, in3;
+    __m128 dot0, dot1, dot2, dot3;
+
+    // Sixteen floats per iteration
+    for(i = 0; i < blockCount; i++){
+      in0 = _mm_load_ps(src);
+      in1 = _mm_load_ps(src + 4);
+      in2 = _mm_load_ps(src + 8);
+      in3 = _mm_load_ps(src + 12);
+      src += 16;
+
+      // sum += x, added in load order
+      sum = _mm_add_ps(sum, in0);
+      sum = _mm_add_ps(sum, in1);
+      sum = _mm_add_ps(sum, in2);
+      sum = _mm_add_ps(sum, in3);
+
+      // Each dot product is written to a different lane (low mask nibble 1, 2, 4, 8), the other lanes are zeroed
+      dot0 = _mm_dp_ps(in0, in0, 0xF1);
+      dot1 = _mm_dp_ps(in1, in1, 0xF2);
+      dot2 = _mm_dp_ps(in2, in2, 0xF4);
+      dot3 = _mm_dp_ps(in3, in3, 0xF8);
+
+      // OR merges the four single-lane results into one vector
+      dot0 = _mm_or_ps(dot0, dot1);
+      dot2 = _mm_or_ps(dot2, dot3);
+      sumOfSquares = _mm_add_ps(sumOfSquares, _mm_or_ps(dot0, dot2));
+    }
+
+    // Horizontal sums of both accumulators
+    _mm_store_ps(sumLanes, sum);
+    _mm_store_ps(squareLanes, sumOfSquares);
+    average = sumLanes[0];
+    deviation = squareLanes[0];
+    for(lane = 1; lane < 4; lane++){
+      average += sumLanes[lane];
+      deviation += squareLanes[lane];
+    }
+
+    // Remaining (num_points % 16) values
+    for(i = blockCount * 16; i < num_points; i++){
+      deviation += (*src) * (*src);
+      average += *src++;
+    }
+
+    average /= num_points;
+    deviation /= num_points;
+    deviation -= (average * average);
+    deviation = sqrtf(deviation);
+  }
+  *stddev = deviation;
+  *mean = average;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the mean of the input buffer and its standard deviation as sqrt(sum(x^2) / num_points - mean^2)
+  \param stddev Receives the calculated standard deviation (0 when num_points is 0)
+  \param mean Receives the calculated mean (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation and mean for
+  \param num_points The number of values in the input buffer used in the calculations
+*/
+static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float deviation = 0;
+  float average = 0;
+  if(num_points > 0){
+    const unsigned int vectorCount = num_points / 4;
+    const float* src = inputBuffer;
+    unsigned int i;
+    unsigned int lane;
+
+    __VOLK_ATTR_ALIGNED(16) float sumLanes[4];
+    __VOLK_ATTR_ALIGNED(16) float squareLanes[4];
+
+    __m128 sum = _mm_setzero_ps();
+    __m128 sumOfSquares = _mm_setzero_ps();
+    __m128 x;
+
+    // Four floats per iteration: sum += x, sumOfSquares += x^2
+    for(i = 0; i < vectorCount; i++){
+      x = _mm_load_ps(src);
+      sum = _mm_add_ps(sum, x);
+      sumOfSquares = _mm_add_ps(sumOfSquares, _mm_mul_ps(x, x));
+      src += 4;
+    }
+
+    // Horizontal sums of both accumulators
+    _mm_store_ps(sumLanes, sum);
+    _mm_store_ps(squareLanes, sumOfSquares);
+    average = sumLanes[0];
+    deviation = squareLanes[0];
+    for(lane = 1; lane < 4; lane++){
+      average += sumLanes[lane];
+      deviation += squareLanes[lane];
+    }
+
+    // Remaining (num_points % 4) values
+    for(i = vectorCount * 4; i < num_points; i++){
+      deviation += (*src) * (*src);
+      average += *src++;
+    }
+
+    average /= num_points;
+    deviation /= num_points;
+    deviation -= (average * average);
+    deviation = sqrtf(deviation);
+  }
+  *stddev = deviation;
+  *mean = average;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Calculates the mean of the input buffer and its standard deviation as sqrt(sum(x^2) / num_points - mean^2)
+  \param stddev Receives the calculated standard deviation (0 when num_points is 0)
+  \param mean Receives the calculated mean (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation and mean for
+  \param num_points The number of values in the input buffer used in the calculations
+*/
+static inline void volk_32f_stddev_and_mean_32f_x2_a_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float sumOfSquares = 0;
+  float sum = 0;
+  unsigned int i;
+
+  // An empty buffer reports zero for both outputs
+  if(num_points == 0){
+    *stddev = sumOfSquares;
+    *mean = sum;
+    return;
+  }
+
+  for(i = 0; i < num_points; i++){
+    sumOfSquares += inputBuffer[i] * inputBuffer[i];
+    sum += inputBuffer[i];
+  }
+
+  sum /= num_points;
+  sumOfSquares /= num_points;
+  sumOfSquares -= (sum * sum);
+
+  *stddev = sqrtf(sumOfSquares);
+  *mean = sum;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
diff --git a/volk/include/volk/volk_32f_x2_add_32f_a.h b/volk/include/volk/volk_32f_x2_add_32f_a.h
new file mode 100644
index 000000000..51e63e54d
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_add_32f_a.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_x2_add_32f_a_H
+#define INCLUDED_volk_32f_x2_add_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Adds two float vectors element by element (SSE, aligned loads and stores)
+  \param cVector Output buffer receiving aVector[i] + bVector[i]
+  \param aVector First input vector
+  \param bVector Second input vector
+  \param num_points The number of values read from aVector and bVector and written to cVector
+*/
+static inline void volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  const unsigned int vectorizedPoints = (num_points / 4) * 4;
+  unsigned int idx;
+
+  // Four floats per iteration
+  for(idx = 0; idx < vectorizedPoints; idx += 4){
+    const __m128 lhs = _mm_load_ps(aVector + idx);
+    const __m128 rhs = _mm_load_ps(bVector + idx);
+    _mm_store_ps(cVector + idx, _mm_add_ps(lhs, rhs));
+  }
+
+  // Remaining (num_points % 4) values are added one at a time
+  for(idx = vectorizedPoints; idx < num_points; idx++){
+    cVector[idx] = aVector[idx] + bVector[idx];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Adds two float vectors element by element (portable C)
+  \param cVector Output buffer receiving aVector[i] + bVector[i]
+  \param aVector First input vector
+  \param bVector Second input vector
+  \param num_points The number of values read from aVector and bVector and written to cVector
+*/
+static inline void volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int i;
+
+  for(i = 0; i < num_points; i++){
+    cVector[i] = aVector[i] + bVector[i];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Adds the two input vectors and stores the results in the third vector (ORC variant); forwards all four arguments unchanged to volk_32f_x2_add_32f_a_orc_impl
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points Passed through to the implementation; presumably the number of values in aVector and bVector to be added together and stored into cVector - confirm against the ORC source
+*/
+extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_add_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_a_H */
diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h
new file mode 100644
index 000000000..52e8286bc
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_add_32f_u.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
+#define INCLUDED_volk_32f_x2_add_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Element-wise sum of two float vectors using unaligned SSE loads and stores
+  \param cVector Output buffer that receives num_points sums
+  \param aVector First vector of addends
+  \param bVector Second vector of addends
+  \param num_points Number of elements read from aVector and bVector and written to cVector
+*/
+static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* Largest multiple of four that does not exceed num_points. */
+  const unsigned int vectorizedCount = (num_points / 4) * 4;
+  unsigned int idx = 0;
+
+  /* Main loop: four floats per iteration through _mm_loadu_ps/_mm_storeu_ps. */
+  for(; idx < vectorizedCount; idx += 4){
+    const __m128 lhs = _mm_loadu_ps(aVector + idx);
+    const __m128 rhs = _mm_loadu_ps(bVector + idx);
+    _mm_storeu_ps(cVector + idx, _mm_add_ps(lhs, rhs));
+  }
+
+  /* Scalar tail for the remaining zero to three elements. */
+  for(; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] + bVector[idx];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Element-wise sum of two float vectors: cVector[i] = aVector[i] + bVector[i]
+  \param cVector Output buffer that receives num_points sums
+  \param aVector First vector of addends
+  \param bVector Second vector of addends
+  \param num_points Number of elements read from aVector and bVector and written to cVector
+*/
+static inline void volk_32f_x2_add_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  /* Portable scalar loop: one addition per element, in ascending order. */
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] + bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a.h b/volk/include/volk/volk_32f_x2_divide_32f_a.h
new file mode 100644
index 000000000..7b60fb22e
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_divide_32f_a.h
@@ -0,0 +1,82 @@
+#ifndef INCLUDED_volk_32f_x2_divide_32f_a_H
+#define INCLUDED_volk_32f_x2_divide_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Element-wise quotient of two float vectors: cVector[i] = aVector[i] / bVector[i]
+  \param cVector Output buffer that receives num_points quotients
+  \param aVector The vector to be divided (dividend)
+  \param bVector The divisor vector
+  \param num_points Number of elements read from aVector and bVector and written to cVector
+
+  The vectorized part uses _mm_load_ps/_mm_store_ps, which require 16-byte aligned addresses.
+*/
+static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* Largest multiple of four that does not exceed num_points. */
+  const unsigned int vectorizedCount = (num_points / 4) * 4;
+  unsigned int idx = 0;
+
+  /* Main loop: four quotients per iteration. */
+  for(; idx < vectorizedCount; idx += 4){
+    const __m128 dividend = _mm_load_ps(aVector + idx);
+    const __m128 divisor = _mm_load_ps(bVector + idx);
+    _mm_store_ps(cVector + idx, _mm_div_ps(dividend, divisor));
+  }
+
+  /* Scalar tail for the remaining zero to three elements. */
+  for(; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] / bVector[idx];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Element-wise quotient of two float vectors: cVector[i] = aVector[i] / bVector[i]
+  \param cVector Output buffer that receives num_points quotients
+  \param aVector The vector to be divided (dividend)
+  \param bVector The divisor vector
+  \param num_points Number of elements read from aVector and bVector and written to cVector
+*/
+static inline void volk_32f_x2_divide_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  /* Portable scalar loop: one division per element, in ascending order. */
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] / bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/* Only declared here (extern); the definition lives outside this header. */
+extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+
+/*!
+  \brief Divides two float vectors by forwarding to volk_32f_x2_divide_32f_a_orc_impl
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divided
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divided and stored into cVector
+*/
+static inline void volk_32f_x2_divide_32f_a_orc(float* cVector,
+                                                const float* aVector,
+                                                const float* bVector,
+                                                unsigned int num_points)
+{
+  /* Thin wrapper: all four arguments are passed through unchanged. */
+  volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
new file mode 100644
index 000000000..961c2418c
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
@@ -0,0 +1,98 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Dot product of two float vectors, stored as a 16-bit integer
+  \param result Receives the float sum converted with a plain (int16_t) cast
+  \param input First float vector
+  \param taps Second float vector
+  \param num_points Number of elements in input and taps
+*/
+static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+  float sum = 0;
+  unsigned int idx;
+
+  /* Accumulate the products in ascending index order. */
+  for(idx = 0; idx < num_points; idx++){
+    sum += (input[idx] * taps[idx]);
+  }
+
+  *result = (int16_t)sum;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+/* __m128 and the _mm_* intrinsics used below are declared by this header. */
+#include <xmmintrin.h>
+
+/*!
+  \brief Dot product of two float vectors, stored as a 16-bit integer (SSE, aligned loads)
+  \param result Receives the float sum converted with a plain (int16_t) cast; no rounding or saturation is applied here
+  \param input First float vector; read with _mm_load_ps, which requires a 16-byte aligned address
+  \param taps Second float vector; read with _mm_load_ps, which requires a 16-byte aligned address
+  \param num_points Number of elements in input and taps
+*/
+static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  // Four independent accumulators, one per group of four floats in a 16-float block
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  // Main loop: 16 products per iteration
+  for(;number < sixteenthPoints; number++){
+
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+
+  // Fold the four accumulators into one vector
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  // Horizontal sum of the four lanes
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  // Scalar tail for the remaining zero to fifteen elements
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  // Cast to the declared output type, matching the generic kernel
+  *result = (int16_t)dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
new file mode 100644
index 000000000..067c33ad8
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
@@ -0,0 +1,290 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Dot product of two float vectors
+  \param result Receives the sum of input[i] * taps[i] over num_points elements
+  \param input First float vector
+  \param taps Second float vector
+  \param num_points Number of elements in input and taps
+*/
+static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+  float sum = 0;
+  unsigned int idx;
+
+  /* Accumulate the products in ascending index order. */
+  for(idx = 0; idx < num_points; idx++){
+    sum += (input[idx] * taps[idx]);
+  }
+
+  *result = sum;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+/* __m128 and the _mm_* intrinsics used below are declared by this header. */
+#include <xmmintrin.h>
+
+/*!
+  \brief Dot product of two float vectors (SSE, aligned loads)
+  \param result Receives the sum of input[i] * taps[i] over num_points elements
+  \param input First float vector; read with _mm_load_ps, which requires a 16-byte aligned address
+  \param taps Second float vector; read with _mm_load_ps, which requires a 16-byte aligned address
+  \param num_points Number of elements in input and taps
+*/
+static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  // Four independent accumulators, one per group of four floats in a 16-float block
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  // Main loop: 16 products per iteration
+  for(;number < sixteenthPoints; number++){
+
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+
+  // Fold the four accumulators into one vector
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  // Horizontal sum of the four lanes
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  // Scalar tail for the remaining zero to fifteen elements
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+/*!
+  \brief Dot product of two float vectors (SSE3 build variant, aligned loads)
+  \param result Receives the sum of input[i] * taps[i] over num_points elements
+  \param input First float vector; read with _mm_load_ps, which requires a 16-byte aligned address
+  \param taps Second float vector; read with _mm_load_ps, which requires a 16-byte aligned address
+  \param num_points Number of elements in input and taps
+*/
+static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+  const unsigned int numBlocks = num_points / 16;
+  unsigned int block, idx;
+  float sum;
+  __VOLK_ATTR_ALIGNED(16) float lanes[4];
+
+  /* Four independent accumulators, one per group of four floats in a 16-float block. */
+  __m128 acc0 = _mm_setzero_ps();
+  __m128 acc1 = _mm_setzero_ps();
+  __m128 acc2 = _mm_setzero_ps();
+  __m128 acc3 = _mm_setzero_ps();
+
+  /* Main loop: 16 products per iteration. */
+  for(block = 0; block < numBlocks; block++){
+    const float* in = input + 16 * block;
+    const float* tp = taps + 16 * block;
+
+    acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_load_ps(in), _mm_load_ps(tp)));
+    acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_load_ps(in + 4), _mm_load_ps(tp + 4)));
+    acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_load_ps(in + 8), _mm_load_ps(tp + 8)));
+    acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_load_ps(in + 12), _mm_load_ps(tp + 12)));
+  }
+
+  /* Fold the accumulators together, then spill the lanes for a scalar horizontal sum. */
+  acc0 = _mm_add_ps(acc0, acc1);
+  acc0 = _mm_add_ps(acc0, acc2);
+  acc0 = _mm_add_ps(acc0, acc3);
+  _mm_store_ps(lanes, acc0);
+
+  sum = lanes[0];
+  for(idx = 1; idx < 4; idx++){
+    sum += lanes[idx];
+  }
+
+  /* Scalar tail for the remaining zero to fifteen elements. */
+  for(idx = numBlocks * 16; idx < num_points; idx++){
+    sum += (input[idx] * taps[idx]);
+  }
+
+  *result = sum;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+/*!
+  \brief Dot product of two float vectors (SSE4.1 _mm_dp_ps, aligned loads)
+  \param result Receives the sum of input[i] * taps[i] over num_points elements
+  \param input First float vector; read with _mm_load_ps, which requires a 16-byte aligned address
+  \param taps Second float vector; read with _mm_load_ps, which requires a 16-byte aligned address
+  \param num_points Number of elements in input and taps
+*/
+static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+  const unsigned int numBlocks = num_points / 16;
+  unsigned int block, idx;
+  float sum;
+  __VOLK_ATTR_ALIGNED(16) float lanes[4];
+
+  __m128 acc = _mm_setzero_ps();
+
+  for(block = 0; block < numBlocks; block++){
+    const float* in = input + 16 * block;
+    const float* tp = taps + 16 * block;
+
+    /* Mask high nibble 0xF: multiply all four lanes. Low nibble: the one output lane
+       that receives the sum (other lanes are zeroed), so each partial lands in its own lane. */
+    __m128 part0 = _mm_dp_ps(_mm_load_ps(in), _mm_load_ps(tp), 0xF1);
+    __m128 part1 = _mm_dp_ps(_mm_load_ps(in + 4), _mm_load_ps(tp + 4), 0xF2);
+    __m128 part2 = _mm_dp_ps(_mm_load_ps(in + 8), _mm_load_ps(tp + 8), 0xF4);
+    __m128 part3 = _mm_dp_ps(_mm_load_ps(in + 12), _mm_load_ps(tp + 12), 0xF8);
+
+    /* The partials occupy disjoint lanes, so bitwise OR merges them into one vector. */
+    part0 = _mm_or_ps(part0, part1);
+    part2 = _mm_or_ps(part2, part3);
+    acc = _mm_add_ps(acc, _mm_or_ps(part0, part2));
+  }
+
+  /* Spill the lanes for a scalar horizontal sum. */
+  _mm_store_ps(lanes, acc);
+
+  sum = lanes[0];
+  for(idx = 1; idx < 4; idx++){
+    sum += lanes[idx];
+  }
+
+  /* Scalar tail for the remaining zero to fifteen elements. */
+  for(idx = numBlocks * 16; idx < num_points; idx++){
+    sum += (input[idx] * taps[idx]);
+  }
+
+  *result = sum;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+/*!
+  \brief Dot product of two float vectors (AVX, aligned loads)
+  \param result Receives the sum of input[i] * taps[i] over num_points elements
+  \param input First float vector; read with _mm256_load_ps, which requires a 32-byte aligned address
+  \param taps Second float vector; read with _mm256_load_ps, which requires a 32-byte aligned address
+  \param num_points Number of elements in input and taps
+*/
+static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+  const unsigned int numBlocks = num_points / 16;
+  unsigned int block, idx;
+  float sum;
+  __VOLK_ATTR_ALIGNED(32) float lanes[8];
+
+  /* Two independent 8-float accumulators cover one 16-float block per iteration. */
+  __m256 acc0 = _mm256_setzero_ps();
+  __m256 acc1 = _mm256_setzero_ps();
+
+  for(block = 0; block < numBlocks; block++){
+    const float* in = input + 16 * block;
+    const float* tp = taps + 16 * block;
+
+    acc0 = _mm256_add_ps(_mm256_mul_ps(_mm256_load_ps(in), _mm256_load_ps(tp)), acc0);
+    acc1 = _mm256_add_ps(_mm256_mul_ps(_mm256_load_ps(in + 8), _mm256_load_ps(tp + 8)), acc1);
+  }
+
+  /* Fold the accumulators together, then spill the lanes for a scalar horizontal sum. */
+  acc0 = _mm256_add_ps(acc0, acc1);
+  _mm256_store_ps(lanes, acc0);
+
+  sum = lanes[0];
+  for(idx = 1; idx < 8; idx++){
+    sum += lanes[idx];
+  }
+
+  /* Scalar tail for the remaining zero to fifteen elements. */
+  for(idx = numBlocks * 16; idx < num_points; idx++){
+    sum += (input[idx] * taps[idx]);
+  }
+
+  *result = sum;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
new file mode 100644
index 000000000..b24e8b1f7
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -0,0 +1,290 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Dot product of two float vectors
+  \param result Receives the sum of input[i] * taps[i] over num_points elements
+  \param input First float vector
+  \param taps Second float vector
+  \param num_points Number of elements in input and taps
+*/
+static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+  float sum = 0;
+  unsigned int idx;
+
+  /* Accumulate the products in ascending index order. */
+  for(idx = 0; idx < num_points; idx++){
+    sum += (input[idx] * taps[idx]);
+  }
+
+  *result = sum;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+/* __m128 and the _mm_* intrinsics used below are declared by this header. */
+#include <xmmintrin.h>
+
+/*!
+  \brief Dot product of two float vectors (SSE, unaligned loads)
+  \param result Receives the sum of input[i] * taps[i] over num_points elements
+  \param input First float vector; read with _mm_loadu_ps
+  \param taps Second float vector; read with _mm_loadu_ps
+  \param num_points Number of elements in input and taps
+*/
+static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  // Four independent accumulators, one per group of four floats in a 16-float block
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  // Main loop: 16 products per iteration
+  for(;number < sixteenthPoints; number++){
+
+    a0Val = _mm_loadu_ps(aPtr);
+    a1Val = _mm_loadu_ps(aPtr+4);
+    a2Val = _mm_loadu_ps(aPtr+8);
+    a3Val = _mm_loadu_ps(aPtr+12);
+    b0Val = _mm_loadu_ps(bPtr);
+    b1Val = _mm_loadu_ps(bPtr+4);
+    b2Val = _mm_loadu_ps(bPtr+8);
+    b3Val = _mm_loadu_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+
+  // Fold the four accumulators into one vector
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  // The local scratch array is declared 16-byte aligned, so the aligned store is valid here
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  // Horizontal sum of the four lanes
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  // Scalar tail for the remaining zero to fifteen elements
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+/*!
+  \brief Dot product of two float vectors (SSE3 build variant, unaligned loads)
+  \param result Receives the sum of input[i] * taps[i] over num_points elements
+  \param input First float vector; read with _mm_loadu_ps
+  \param taps Second float vector; read with _mm_loadu_ps
+  \param num_points Number of elements in input and taps
+*/
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+  const unsigned int numBlocks = num_points / 16;
+  unsigned int block, idx;
+  float sum;
+  __VOLK_ATTR_ALIGNED(16) float lanes[4];
+
+  /* Four independent accumulators, one per group of four floats in a 16-float block. */
+  __m128 acc0 = _mm_setzero_ps();
+  __m128 acc1 = _mm_setzero_ps();
+  __m128 acc2 = _mm_setzero_ps();
+  __m128 acc3 = _mm_setzero_ps();
+
+  /* Main loop: 16 products per iteration. */
+  for(block = 0; block < numBlocks; block++){
+    const float* in = input + 16 * block;
+    const float* tp = taps + 16 * block;
+
+    acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_loadu_ps(in), _mm_loadu_ps(tp)));
+    acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_loadu_ps(in + 4), _mm_loadu_ps(tp + 4)));
+    acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_loadu_ps(in + 8), _mm_loadu_ps(tp + 8)));
+    acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_loadu_ps(in + 12), _mm_loadu_ps(tp + 12)));
+  }
+
+  /* Fold the accumulators together, then spill the lanes for a scalar horizontal sum. */
+  acc0 = _mm_add_ps(acc0, acc1);
+  acc0 = _mm_add_ps(acc0, acc2);
+  acc0 = _mm_add_ps(acc0, acc3);
+  _mm_store_ps(lanes, acc0);
+
+  sum = lanes[0];
+  for(idx = 1; idx < 4; idx++){
+    sum += lanes[idx];
+  }
+
+  /* Scalar tail for the remaining zero to fifteen elements. */
+  for(idx = numBlocks * 16; idx < num_points; idx++){
+    sum += (input[idx] * taps[idx]);
+  }
+
+  *result = sum;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+/*!
+  \brief Dot product of two float vectors (SSE4.1 _mm_dp_ps, unaligned loads)
+  \param result Receives the sum of input[i] * taps[i] over num_points elements
+  \param input First float vector; read with _mm_loadu_ps
+  \param taps Second float vector; read with _mm_loadu_ps
+  \param num_points Number of elements in input and taps
+*/
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+  const unsigned int numBlocks = num_points / 16;
+  unsigned int block, idx;
+  float sum;
+  __VOLK_ATTR_ALIGNED(16) float lanes[4];
+
+  __m128 acc = _mm_setzero_ps();
+
+  for(block = 0; block < numBlocks; block++){
+    const float* in = input + 16 * block;
+    const float* tp = taps + 16 * block;
+
+    /* Mask high nibble 0xF: multiply all four lanes. Low nibble: the one output lane
+       that receives the sum (other lanes are zeroed), so each partial lands in its own lane. */
+    __m128 part0 = _mm_dp_ps(_mm_loadu_ps(in), _mm_loadu_ps(tp), 0xF1);
+    __m128 part1 = _mm_dp_ps(_mm_loadu_ps(in + 4), _mm_loadu_ps(tp + 4), 0xF2);
+    __m128 part2 = _mm_dp_ps(_mm_loadu_ps(in + 8), _mm_loadu_ps(tp + 8), 0xF4);
+    __m128 part3 = _mm_dp_ps(_mm_loadu_ps(in + 12), _mm_loadu_ps(tp + 12), 0xF8);
+
+    /* The partials occupy disjoint lanes, so bitwise OR merges them into one vector. */
+    part0 = _mm_or_ps(part0, part1);
+    part2 = _mm_or_ps(part2, part3);
+    acc = _mm_add_ps(acc, _mm_or_ps(part0, part2));
+  }
+
+  /* Spill the lanes for a scalar horizontal sum. */
+  _mm_store_ps(lanes, acc);
+
+  sum = lanes[0];
+  for(idx = 1; idx < 4; idx++){
+    sum += lanes[idx];
+  }
+
+  /* Scalar tail for the remaining zero to fifteen elements. */
+  for(idx = numBlocks * 16; idx < num_points; idx++){
+    sum += (input[idx] * taps[idx]);
+  }
+
+  *result = sum;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+/*!
+  \brief Dot product of two float vectors (AVX, unaligned loads)
+  \param result Receives the sum of input[i] * taps[i] over num_points elements
+  \param input First float vector; read with _mm256_loadu_ps
+  \param taps Second float vector; read with _mm256_loadu_ps
+  \param num_points Number of elements in input and taps
+*/
+static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+  const unsigned int numBlocks = num_points / 16;
+  unsigned int block, idx;
+  float sum;
+  __VOLK_ATTR_ALIGNED(32) float lanes[8];
+
+  /* Two independent 8-float accumulators cover one 16-float block per iteration. */
+  __m256 acc0 = _mm256_setzero_ps();
+  __m256 acc1 = _mm256_setzero_ps();
+
+  for(block = 0; block < numBlocks; block++){
+    const float* in = input + 16 * block;
+    const float* tp = taps + 16 * block;
+
+    acc0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(in), _mm256_loadu_ps(tp)), acc0);
+    acc1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(in + 8), _mm256_loadu_ps(tp + 8)), acc1);
+  }
+
+  /* Fold the accumulators together, then spill the lanes for a scalar horizontal sum. */
+  acc0 = _mm256_add_ps(acc0, acc1);
+  _mm256_storeu_ps(lanes, acc0);
+
+  sum = lanes[0];
+  for(idx = 1; idx < 8; idx++){
+    sum += lanes[idx];
+  }
+
+  /* Scalar tail for the remaining zero to fifteen elements. */
+  for(idx = numBlocks * 16; idx < num_points; idx++){
+    sum += (input[idx] * taps[idx]);
+  }
+
+  *result = sum;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h
new file mode 100644
index 000000000..52d80b6bb
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a_H
+#define INCLUDED_volk_32f_x2_interleave_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Interleaves the I & Q vector data into the complex vector
+  \param complexVector The complex output vector; written as consecutive (I, Q) float pairs
+  \param iBuffer The I buffer data to be interleaved
+  \param qBuffer The Q buffer data to be interleaved
+  \param num_points The number of complex data values to be interleaved
+
+  The vectorized part uses _mm_load_ps/_mm_store_ps, which require 16-byte aligned addresses.
+*/
+static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+  unsigned int number = 0;
+  // The output is addressed as a flat float array: two floats per complex value
+  float* complexVectorPtr = (float*)complexVector;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+
+  // Same type as num_points and the loop counter, so the comparison and the
+  // assignment to number below involve no width conversion
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m128 iValue, qValue, cplxValue;
+  for(;number < quarterPoints; number++){
+    iValue = _mm_load_ps(iBufferPtr);
+    qValue = _mm_load_ps(qBufferPtr);
+
+    // Interleaves the lower two values in the i and q variables into one buffer
+    cplxValue = _mm_unpacklo_ps(iValue, qValue);
+    _mm_store_ps(complexVectorPtr, cplxValue);
+    complexVectorPtr += 4;
+
+    // Interleaves the upper two values in the i and q variables into one buffer
+    cplxValue = _mm_unpackhi_ps(iValue, qValue);
+    _mm_store_ps(complexVectorPtr, cplxValue);
+    complexVectorPtr += 4;
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  // Scalar tail for the remaining zero to three complex values
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *complexVectorPtr++ = *iBufferPtr++;
+    *complexVectorPtr++ = *qBufferPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Interleaves the I & Q vector data into the complex vector.
+  \param complexVector The complex output vector; written as consecutive (I, Q) float pairs
+  \param iBuffer The I buffer data to be interleaved
+  \param qBuffer The Q buffer data to be interleaved
+  \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_x2_interleave_32fc_a_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+  /* The output is addressed as a flat float array: two floats per complex value. */
+  float* out = (float*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    *out++ = iBuffer[idx];
+    *out++ = qBuffer[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */
diff --git a/volk/include/volk/volk_32f_x2_max_32f_a.h b/volk/include/volk/volk_32f_x2_max_32f_a.h
new file mode 100644
index 000000000..79f2d04b5
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_max_32f_a.h
@@ -0,0 +1,85 @@
+#ifndef INCLUDED_volk_32f_x2_max_32f_a_H
+#define INCLUDED_volk_32f_x2_max_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Element-wise maximum of two float vectors, stored in cVector
+  \param cVector Output buffer that receives num_points maxima
+  \param aVector First vector to compare
+  \param bVector Second vector to compare
+  \param num_points Number of elements read from aVector and bVector and written to cVector
+
+  The vectorized part uses _mm_load_ps/_mm_store_ps, which require 16-byte aligned addresses.
+*/
+static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* Largest multiple of four that does not exceed num_points. */
+  const unsigned int vectorizedCount = (num_points / 4) * 4;
+  unsigned int idx = 0;
+
+  /* Main loop: four maxima per iteration. */
+  for(; idx < vectorizedCount; idx += 4){
+    const __m128 lhs = _mm_load_ps(aVector + idx);
+    const __m128 rhs = _mm_load_ps(bVector + idx);
+    _mm_store_ps(cVector + idx, _mm_max_ps(lhs, rhs));
+  }
+
+  /* Scalar tail: the bVector value is kept whenever "a > b" is false. */
+  for(; idx < num_points; ++idx){
+    const float lhs = aVector[idx];
+    const float rhs = bVector[idx];
+    if(lhs > rhs){
+      cVector[idx] = lhs;
+    }
+    else{
+      cVector[idx] = rhs;
+    }
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Element-wise maximum of two float vectors, stored in cVector
+  \param cVector Output buffer that receives num_points maxima
+  \param aVector First vector to compare
+  \param bVector Second vector to compare
+  \param num_points Number of elements read from aVector and bVector and written to cVector
+*/
+static inline void volk_32f_x2_max_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    const float lhs = aVector[idx];
+    const float rhs = bVector[idx];
+    /* The bVector value is kept whenever "a > b" is false. */
+    if(lhs > rhs){
+      cVector[idx] = lhs;
+    }
+    else{
+      cVector[idx] = rhs;
+    }
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/* Only declared here (extern); the definition lives outside this header. */
+extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+
+/*!
+  \brief Element-wise maximum of two float vectors, computed by forwarding to volk_32f_x2_max_32f_a_orc_impl
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_x2_max_32f_a_orc(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
+{
+  /* Thin wrapper: all four arguments are passed through unchanged. */
+  volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_max_32f_a_H */
diff --git a/volk/include/volk/volk_32f_x2_min_32f_a.h b/volk/include/volk/volk_32f_x2_min_32f_a.h
new file mode 100644
index 000000000..42cac0833
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_min_32f_a.h
@@ -0,0 +1,85 @@
+#ifndef INCLUDED_volk_32f_x2_min_32f_a_H
+#define INCLUDED_volk_32f_x2_min_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Element-wise minimum of two float vectors, stored in cVector
+  \param cVector Output buffer that receives num_points minima
+  \param aVector First vector to compare
+  \param bVector Second vector to compare
+  \param num_points Number of elements read from aVector and bVector and written to cVector
+
+  The vectorized part uses _mm_load_ps/_mm_store_ps, which require 16-byte aligned addresses.
+*/
+static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* Largest multiple of four that does not exceed num_points. */
+  const unsigned int vectorizedCount = (num_points / 4) * 4;
+  unsigned int idx = 0;
+
+  /* Main loop: four minima per iteration. */
+  for(; idx < vectorizedCount; idx += 4){
+    const __m128 lhs = _mm_load_ps(aVector + idx);
+    const __m128 rhs = _mm_load_ps(bVector + idx);
+    _mm_store_ps(cVector + idx, _mm_min_ps(lhs, rhs));
+  }
+
+  /* Scalar tail: the bVector value is kept whenever "a < b" is false. */
+  for(; idx < num_points; ++idx){
+    const float lhs = aVector[idx];
+    const float rhs = bVector[idx];
+    if(lhs < rhs){
+      cVector[idx] = lhs;
+    }
+    else{
+      cVector[idx] = rhs;
+    }
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Element-wise minimum of two float vectors, stored in cVector
+  \param cVector Output buffer that receives num_points minima
+  \param aVector First vector to compare
+  \param bVector Second vector to compare
+  \param num_points Number of elements read from aVector and bVector and written to cVector
+*/
+static inline void volk_32f_x2_min_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    const float lhs = aVector[idx];
+    const float rhs = bVector[idx];
+    /* The bVector value is kept whenever "a < b" is false. */
+    if(lhs < rhs){
+      cVector[idx] = lhs;
+    }
+    else{
+      cVector[idx] = rhs;
+    }
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/* Only declared here (extern); the definition lives outside this header. */
+extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+
+/*!
+  \brief Element-wise minimum of two float vectors, computed by forwarding to volk_32f_x2_min_32f_a_orc_impl
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_x2_min_32f_a_orc(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
+{
+  /* Thin wrapper: all four arguments are passed through unchanged. */
+  volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_min_32f_a_H */
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a.h b/volk/include/volk/volk_32f_x2_multiply_32f_a.h
new file mode 100644
index 000000000..340e05165
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_multiply_32f_a.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
+#define INCLUDED_volk_32f_x2_multiply_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Multiplies the two input vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* Element-wise product with aligned SSE loads/stores, four floats per step;
+     the last num_points % 4 elements are multiplied one at a time. */
+  const unsigned int vectorCount = (num_points / 4) * 4;
+  unsigned int idx;
+
+  for(idx = 0; idx < vectorCount; idx += 4){
+    const __m128 lhs = _mm_load_ps(aVector + idx);
+    const __m128 rhs = _mm_load_ps(bVector + idx);
+
+    _mm_store_ps(cVector + idx, _mm_mul_ps(lhs, rhs));
+  }
+
+  /* Scalar tail */
+  for(idx = vectorCount; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Multiplies the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* Element-wise product with aligned AVX loads/stores, eight floats per step;
+     the last num_points % 8 elements are multiplied one at a time. */
+  const unsigned int vectorCount = (num_points / 8) * 8;
+  unsigned int idx;
+
+  for(idx = 0; idx < vectorCount; idx += 8){
+    const __m256 lhs = _mm256_load_ps(aVector + idx);
+    const __m256 rhs = _mm256_load_ps(bVector + idx);
+
+    _mm256_store_ps(cVector + idx, _mm256_mul_ps(lhs, rhs));
+  }
+
+  /* Scalar tail */
+  for(idx = vectorCount; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies the two input vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* Scalar reference path: cVector[i] = aVector[i] * bVector[i] for every i below num_points. */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Multiplies the two input vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); // Only declared here; the definition lives outside this header (presumably the ORC-generated kernel - confirm)
+static inline void volk_32f_x2_multiply_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ // Thin wrapper giving the ORC path the same signature as the other variants in this header
+ volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points); // Forward every argument unchanged
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h
new file mode 100644
index 000000000..bfb896d60
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_multiply_32f_u.h
@@ -0,0 +1,106 @@
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
+#define INCLUDED_volk_32f_x2_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Multiplies the two input vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* Element-wise product with unaligned SSE loads/stores, four floats per step;
+     the last num_points % 4 elements are multiplied one at a time. */
+  const unsigned int vectorCount = (num_points / 4) * 4;
+  unsigned int idx;
+
+  for(idx = 0; idx < vectorCount; idx += 4){
+    const __m128 lhs = _mm_loadu_ps(aVector + idx);
+    const __m128 rhs = _mm_loadu_ps(bVector + idx);
+
+    _mm_storeu_ps(cVector + idx, _mm_mul_ps(lhs, rhs));
+  }
+
+  /* Scalar tail */
+  for(idx = vectorCount; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Multiplies the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* Element-wise product with unaligned AVX loads/stores, eight floats per step;
+     the last num_points % 8 elements are multiplied one at a time. */
+  const unsigned int vectorCount = (num_points / 8) * 8;
+  unsigned int idx;
+
+  for(idx = 0; idx < vectorCount; idx += 8){
+    const __m256 lhs = _mm256_loadu_ps(aVector + idx);
+    const __m256 rhs = _mm256_loadu_ps(bVector + idx);
+
+    _mm256_storeu_ps(cVector + idx, _mm256_mul_ps(lhs, rhs));
+  }
+
+  /* Scalar tail */
+  for(idx = vectorCount; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies the two input vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* Scalar reference path: cVector[i] = aVector[i] * bVector[i] for every i below num_points. */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h
new file mode 100644
index 000000000..10fc267dc
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h
@@ -0,0 +1,156 @@
+#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
+#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param scalar The scaling value being multiplied against each data point
+ \param num_points The number of complex data values to be interleaved
+ */
+static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ // Scales I and Q floats by scalar and writes them as interleaved int16 pairs, four complex points per SSE2 step
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ __m128 vScalar = _mm_set_ps1(scalar); // scalar replicated into all four lanes
+
+ const unsigned int quarterPoints = num_points / 4; // number of full four-point vector iterations
+
+ __m128 iValue, qValue, cplxValue1, cplxValue2;
+ __m128i intValue1, intValue2;
+
+ int16_t* complexVectorPtr = (int16_t*)complexVector; // output is written as a flat run of int16_t values (I, Q, I, Q, ...)
+
+ for(;number < quarterPoints; number++){
+ iValue = _mm_load_ps(iBufferPtr); // aligned load of four I samples
+ qValue = _mm_load_ps(qBufferPtr); // aligned load of four Q samples
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+ cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+ cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+
+ intValue1 = _mm_cvtps_epi32(cplxValue1); // float -> int32; NOTE(review): this intrinsic rounds per the current SSE rounding mode, while the scalar tail below truncates via a C cast - confirm the mismatch is acceptable
+ intValue2 = _mm_cvtps_epi32(cplxValue2);
+
+ intValue1 = _mm_packs_epi32(intValue1, intValue2); // narrow both int32 vectors into eight int16 values (signed-saturating pack)
+
+ _mm_store_si128((__m128i*)complexVectorPtr, intValue1); // aligned 16-byte store of four interleaved (I,Q) pairs
+ complexVectorPtr += 8;
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4; // first point not covered by the vector loop
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for(; number < num_points; number++){ // scalar tail for the last num_points % 4 points
+ *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+ }
+
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param scalar The scaling value being multiplied against each data point
+ \param num_points The number of complex data values to be interleaved
+ */
+static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+  /* Scales I and Q by scalar with SSE, four complex points per step, and narrows each
+     product to int16_t with a plain C cast via a small aligned staging buffer.
+     Output layout is I, Q, I, Q, ... as consecutive int16_t values. */
+  const unsigned int vectorIters = num_points / 4;
+  const __m128 scale = _mm_set_ps1(scalar);
+  int16_t* out = (int16_t*)complexVector;
+  __VOLK_ATTR_ALIGNED(16) float staged[4];
+  unsigned int block;
+  unsigned int lane;
+  unsigned int idx;
+
+  for(block = 0; block < vectorIters; ++block){
+    const __m128 iVals = _mm_load_ps(iBuffer + 4 * block);
+    const __m128 qVals = _mm_load_ps(qBuffer + 4 * block);
+
+    /* Lower half: i0 q0 i1 q1 */
+    _mm_store_ps(staged, _mm_mul_ps(_mm_unpacklo_ps(iVals, qVals), scale));
+    for(lane = 0; lane < 4; ++lane){
+      *out++ = (int16_t)(staged[lane]);
+    }
+
+    /* Upper half: i2 q2 i3 q3 */
+    _mm_store_ps(staged, _mm_mul_ps(_mm_unpackhi_ps(iVals, qVals), scale));
+    for(lane = 0; lane < 4; ++lane){
+      *out++ = (int16_t)(staged[lane]);
+    }
+  }
+
+  /* Scalar tail for the last num_points % 4 points */
+  idx = vectorIters * 4;
+  out = (int16_t*)(&complexVector[idx]);
+  for(; idx < num_points; ++idx){
+    *out++ = (int16_t)(iBuffer[idx] * scalar);
+    *out++ = (int16_t)(qBuffer[idx] * scalar);
+  }
+
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param scalar The scaling value being multiplied against each data point
+ \param num_points The number of complex data values to be interleaved
+ */
+static inline void volk_32f_x2_s32f_interleave_16ic_a_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+  /* Scalar reference path: for each point, write (int16_t)(I * scalar) followed by
+     (int16_t)(Q * scalar) into the output, viewed as a flat int16_t array. */
+  int16_t* out = (int16_t*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx, out += 2){
+    out[0] = (int16_t)(iBuffer[idx] * scalar);
+    out[1] = (int16_t)(qBuffer[idx] * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */
diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a.h b/volk/include/volk/volk_32f_x2_subtract_32f_a.h
new file mode 100644
index 000000000..e2b8be797
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_subtract_32f_a.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H
+#define INCLUDED_volk_32f_x2_subtract_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Subtracts bVector from aVector and stores the results in cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The initial vector
+ \param bVector The vector to be subtracted
+ \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* Element-wise difference (aVector - bVector) with aligned SSE loads/stores, four floats
+     per step; the last num_points % 4 elements are subtracted one at a time. */
+  const unsigned int vectorCount = (num_points / 4) * 4;
+  unsigned int idx;
+
+  for(idx = 0; idx < vectorCount; idx += 4){
+    const __m128 minuend = _mm_load_ps(aVector + idx);
+    const __m128 subtrahend = _mm_load_ps(bVector + idx);
+
+    _mm_store_ps(cVector + idx, _mm_sub_ps(minuend, subtrahend));
+  }
+
+  /* Scalar tail */
+  for(idx = vectorCount; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] - bVector[idx];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Subtracts bVector from aVector and stores the results in cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The initial vector
+ \param bVector The vector to be subtracted
+ \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_x2_subtract_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* Scalar reference path: cVector[i] = aVector[i] - bVector[i] for every i below num_points. */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] - bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Subtracts bVector from aVector and stores the results in cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The initial vector
+ \param bVector The vector to be subtracted
+ \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); // Only declared here; the definition lives outside this header (presumably the ORC-generated kernel - confirm)
+static inline void volk_32f_x2_subtract_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ // Thin wrapper giving the ORC path the same signature as the other variants in this header
+ volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points); // Forward every argument unchanged
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */
diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h
new file mode 100644
index 000000000..3c530628c
--- /dev/null
+++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h
@@ -0,0 +1,151 @@
+#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
+#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#ifndef MAX
+#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#endif
+
+#ifdef LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { // Sums c0*x + c1*x^2 + c2*x^3 + c3*x^4 over all inputs x (each clamped from below to *cutoff), adds count*c4, and writes the total to target[0]
+
+
+ float result = 0.0;
+ float fst = 0.0; // x
+ float sq = 0.0; // x^2
+ float thrd = 0.0; // x^3
+ float frth = 0.0; // x^4
+ //float fith = 0.0;
+
+
+
+ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
+
+ xmm9 = _mm_setzero_ps(); // accumulator for the x and x^2 terms
+ xmm1 = _mm_setzero_ps(); // accumulator for the x^3 and x^4 terms
+
+ xmm0 = _mm_load1_ps(&center_point_array[0]); // coefficient of x, broadcast to all lanes
+ xmm6 = _mm_load1_ps(&center_point_array[1]); // coefficient of x^2
+ xmm7 = _mm_load1_ps(&center_point_array[2]); // coefficient of x^3
+ xmm8 = _mm_load1_ps(&center_point_array[3]); // coefficient of x^4
+ //xmm11 = _mm_load1_ps(&center_point_array[4]);
+ xmm10 = _mm_load1_ps(cutoff); // lower clamp value, broadcast to all lanes
+
+ int bound = num_bytes >> 4; // number of full 16-byte groups (four floats each)
+ int leftovers = (num_bytes >> 2) & 3; // floats remaining after the full groups
+ int i = 0;
+
+ for(; i < bound; ++i) {
+ xmm2 = _mm_load_ps(src0); // aligned load of four inputs
+ xmm2 = _mm_max_ps(xmm10, xmm2); // clamp each input from below to the cutoff
+ xmm3 = _mm_mul_ps(xmm2, xmm2); // x^2
+ xmm4 = _mm_mul_ps(xmm2, xmm3); // x^3
+ xmm5 = _mm_mul_ps(xmm3, xmm3); // x^4
+ //xmm12 = _mm_mul_ps(xmm3, xmm4);
+
+ xmm2 = _mm_mul_ps(xmm2, xmm0); // c0 * x
+ xmm3 = _mm_mul_ps(xmm3, xmm6); // c1 * x^2
+ xmm4 = _mm_mul_ps(xmm4, xmm7); // c2 * x^3
+ xmm5 = _mm_mul_ps(xmm5, xmm8); // c3 * x^4
+ //xmm12 = _mm_mul_ps(xmm12, xmm11);
+
+ xmm2 = _mm_add_ps(xmm2, xmm3);
+ xmm3 = _mm_add_ps(xmm4, xmm5);
+
+ src0 += 4; // src0 itself advances, so the leftover loop below indexes from the first unprocessed float
+
+ xmm9 = _mm_add_ps(xmm2, xmm9);
+
+ xmm1 = _mm_add_ps(xmm3, xmm1);
+
+ //xmm9 = _mm_add_ps(xmm12, xmm9);
+ }
+
+ xmm2 = _mm_hadd_ps(xmm9, xmm1); // three horizontal adds reduce both accumulators to one total in lane 0
+ xmm3 = _mm_hadd_ps(xmm2, xmm2);
+ xmm4 = _mm_hadd_ps(xmm3, xmm3);
+
+ _mm_store_ss(&result, xmm4); // keep only lane 0
+
+
+
+ for(i = 0; i < leftovers; ++i) { // scalar pass over the floats the vector loop did not cover
+ fst = src0[i];
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ //fith = sq * thrd;
+
+ result += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth);// +
+ //center_point_array[4] * fith);
+ }
+
+ result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
+
+ target[0] = result;
+}
+
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+  /* Scalar reference path. num_bytes is a byte count, so num_bytes >> 2 floats are read.
+     Each input x is clamped from below to *cutoff, then
+       c0*x + c1*x^2 + c2*x^3 + c3*x^4
+     is accumulated; finally (element count) * c4 is added and the total stored in *target. */
+  const unsigned int count = num_bytes >> 2;
+  float total = 0.0;
+  unsigned int idx;
+
+  for(idx = 0; idx < count; ++idx) {
+    float x = src0[idx];
+    float x2;
+    float x3;
+    float x4;
+
+    x = MAX(x, *cutoff);
+
+    x2 = x * x;
+    x3 = x * x2;
+    x4 = x2 * x2;
+
+    total += (center_point_array[0] * x +
+              center_point_array[1] * x2 +
+              center_point_array[2] * x3 +
+              center_point_array[3] * x4);
+  }
+
+  /* Constant term, contributed once per element */
+  total += ((float)(count)) * (center_point_array[4]);
+
+  *target = total;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
diff --git a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h
new file mode 100644
index 000000000..109b787e8
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h
@@ -0,0 +1,111 @@
+#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
+  /* Scalar reference path: treats input as interleaved real/imaginary floats and accumulates
+     real += re[i] * taps[i] and imag += im[i] * taps[i]; the pair is then copied out as one
+     complex value. */
+  const float* samples = (float*)input;
+  float res[2];
+  unsigned int idx;
+
+  res[0] = 0;
+  res[1] = 0;
+
+  for(idx = 0; idx < num_points; ++idx, samples += 2){
+    const float tap = taps[idx];
+
+    res[0] += (samples[0] * tap);
+    res[1] += (samples[1] * tap);
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+/*!
+  \brief Computes the dot product of a complex input vector with real-valued taps using aligned SSE loads
+  \param result Receives the single complex sum of input[i] * taps[i]
+  \param input The complex input vector, read as interleaved real/imaginary floats (loaded with _mm_load_ps)
+  \param taps The real-valued taps, one per complex input value (loaded with _mm_load_ps)
+  \param num_points The number of complex values in input, and of floats in taps, to accumulate
+*/
+static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  /* Each vector iteration consumes eight complex points (16 floats) and eight taps. */
+  const unsigned int eighthPoints = num_points / 8;
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const float* aPtr = (const float*)input;
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 x0Val, x1Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  /* Four independent accumulators, each holding (re, im, re, im) partial sums. */
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < eighthPoints; number++){
+
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+
+    /* Load each group of four taps once, then duplicate every tap so it lines up
+       with both the real and the imaginary float of its sample: t0 t0 t1 t1, ... */
+    x0Val = _mm_load_ps(bPtr);
+    x1Val = _mm_load_ps(bPtr+4);
+    b0Val = _mm_unpacklo_ps(x0Val, x0Val);
+    b1Val = _mm_unpackhi_ps(x0Val, x0Val);
+    b2Val = _mm_unpacklo_ps(x1Val, x1Val);
+    b3Val = _mm_unpackhi_ps(x1Val, x1Val);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 8;
+  }
+
+  /* Collapse the four accumulators into one (re, im, re, im) vector. */
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  /* Lanes 0 and 2 are real partial sums, lanes 1 and 3 imaginary ones. */
+  *realpt = dotProductVector[0];
+  *imagpt = dotProductVector[1];
+  *realpt += dotProductVector[2];
+  *imagpt += dotProductVector[3];
+
+  /* Scalar tail for the last num_points % 8 points */
+  number = eighthPoints*8;
+  for(;number < num_points; number++){
+    *realpt += ((*aPtr++) * (*bPtr));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h
new file mode 100644
index 000000000..28d584bf2
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h
@@ -0,0 +1,95 @@
+#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies the input complex vector with the input float vector and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The complex vector to be multiplied
+ \param bVector The vectors containing the float values to be multiplied against each complex value in aVector
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+  /* Scales each complex value in aVector by the matching float in bVector using aligned SSE
+     loads/stores, four complex points per step; the remainder is handled one point at a time. */
+  const unsigned int vectorIters = num_points / 4;
+  unsigned int block;
+  unsigned int idx;
+
+  for(block = 0; block < vectorIters; ++block){
+    const lv_32fc_t* in = aVector + 4 * block;
+    lv_32fc_t* out = cVector + 4 * block;
+
+    /* Two complex values (four floats) per register */
+    const __m128 firstPair = _mm_load_ps((const float*)in);
+    const __m128 secondPair = _mm_load_ps((const float*)(in + 2));
+    const __m128 scales = _mm_load_ps(bVector + 4 * block);
+
+    /* Duplicate each scale so it multiplies both the real and imaginary float of its value */
+    const __m128 scalesLo = _mm_shuffle_ps(scales, scales, _MM_SHUFFLE(1,1,0,0));
+    const __m128 scalesHi = _mm_shuffle_ps(scales, scales, _MM_SHUFFLE(3,3,2,2));
+
+    _mm_store_ps((float*)out, _mm_mul_ps(firstPair, scalesLo));
+    _mm_store_ps((float*)(out + 2), _mm_mul_ps(secondPair, scalesHi));
+  }
+
+  /* Scalar tail */
+  for(idx = vectorIters * 4; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies the input complex vector with the input float vector and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The complex vector to be multiplied
+ \param bVector The vector containing the float values to be multiplied against each complex value in aVector
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+  /* Scalar reference path: cVector[i] = aVector[i] * bVector[i] (complex times float). */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+ \brief Multiplies the input complex vector with the input float vector and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The complex vector to be multiplied
+ \param bVector The vector containing the float values to be multiplied against each complex value in aVector
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+ */
+extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); // Only declared here; the definition lives outside this header (presumably the ORC-generated kernel - confirm)
+static inline void volk_32fc_32f_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ // Thin wrapper giving the ORC path the same signature as the other variants in this header
+ volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); // Forward every argument unchanged
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_conjugate_32fc_a.h
new file mode 100644
index 000000000..919280d51
--- /dev/null
+++ b/volk/include/volk/volk_32fc_conjugate_32fc_a.h
@@ -0,0 +1,64 @@
+#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Takes the conjugate of a complex vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+  /* Conjugates two complex values per step with aligned loads/stores by XOR-ing the
+     sign bit of every imaginary float; an odd trailing value goes through lv_conj. */
+  const unsigned int pairCount = num_points / 2;
+  /* -0.f has only the sign bit set, so the XOR flips the sign of lanes 1 and 3 */
+  const __m128 signMask = _mm_setr_ps(0, -0.f, 0, -0.f);
+  unsigned int pair;
+
+  for(pair = 0; pair < pairCount; ++pair){
+    const __m128 values = _mm_load_ps((float*)(aVector + 2 * pair)); // ar, ai, br, bi
+
+    _mm_store_ps((float*)(cVector + 2 * pair), _mm_xor_ps(values, signMask));
+  }
+
+  if((num_points % 2) != 0) {
+    cVector[2 * pairCount] = lv_conj(aVector[2 * pairCount]);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Takes the conjugate of a complex vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+  /* Scalar reference path: cVector[i] = lv_conj(aVector[i]) for every i below num_points. */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = lv_conj(aVector[idx]);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h
new file mode 100644
index 000000000..e0d79ea7b
--- /dev/null
+++ b/volk/include/volk/volk_32fc_conjugate_32fc_u.h
@@ -0,0 +1,64 @@
+#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Takes the conjugate of a complex vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+  /* Conjugates two complex values per step with unaligned loads/stores by XOR-ing the
+     sign bit of every imaginary float; an odd trailing value goes through lv_conj. */
+  const unsigned int pairCount = num_points / 2;
+  /* -0.f has only the sign bit set, so the XOR flips the sign of lanes 1 and 3 */
+  const __m128 signMask = _mm_setr_ps(0, -0.f, 0, -0.f);
+  unsigned int pair;
+
+  for(pair = 0; pair < pairCount; ++pair){
+    const __m128 values = _mm_loadu_ps((float*)(aVector + 2 * pair)); // ar, ai, br, bi
+
+    _mm_storeu_ps((float*)(cVector + 2 * pair), _mm_xor_ps(values, signMask));
+  }
+
+  if((num_points % 2) != 0) {
+    cVector[2 * pairCount] = lv_conj(aVector[2 * pairCount]);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Takes the conjugate of a complex vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+  /* Scalar reference path: cVector[i] = lv_conj(aVector[i]) for every i below num_points. */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = lv_conj(aVector[idx]);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h
new file mode 100644
index 000000000..4106f3851
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
+#define INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Splits an interleaved complex vector into separate I and Q float vectors (aligned SSE loads/stores).
+  \param iBuffer Output buffer for the I values
+  \param qBuffer Output buffer for the Q values
+  \param complexVector The interleaved complex input vector
+  \param num_points Number of complex values to deinterleave
+*/
+static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 4;
+  const float* in = (float*)complexVector;
+  float* iOut = iBuffer;
+  float* qOut = qBuffer;
+  unsigned int idx;
+  __m128 lo, hi;
+
+  // Each pass consumes four complex values (eight floats).
+  for(idx = 0; idx < blockCount; idx++){
+    lo = _mm_load_ps(in);
+    hi = _mm_load_ps(in + 4);
+    in += 8;
+
+    // Even lanes carry I (i1i2i3i4), odd lanes carry Q (q1q2q3q4).
+    _mm_store_ps(iOut, _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0)));
+    _mm_store_ps(qOut, _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1)));
+
+    iOut += 4;
+    qOut += 4;
+  }
+
+  // Scalar pass over the values that did not fill a whole block.
+  for(idx = blockCount * 4; idx < num_points; idx++){
+    *iOut++ = *in++;
+    *qOut++ = *in++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Splits an interleaved complex vector into separate I and Q float vectors (portable C).
+  \param iBuffer Output buffer for the I values
+  \param qBuffer Output buffer for the Q values
+  \param complexVector The interleaved complex input vector
+  \param num_points Number of complex values to deinterleave
+*/
+static inline void volk_32fc_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  // The input is read as a flat float array: I first, then Q, for each value.
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    iBuffer[idx] = in[0];
+    qBuffer[idx] = in[1];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h
new file mode 100644
index 000000000..77566e671
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h
@@ -0,0 +1,78 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Splits an interleaved lv_32fc_t vector into separate double-precision I and Q vectors (aligned SSE2).
+  \param iBuffer Output buffer for the I values
+  \param qBuffer Output buffer for the Q values
+  \param complexVector The interleaved complex input vector
+  \param num_points Number of complex values to deinterleave
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const unsigned int pairCount = num_points / 2;
+  const float* in = (float*)complexVector;
+  double* iOut = iBuffer;
+  double* qOut = qBuffer;
+  unsigned int idx;
+  __m128 packed, picked;
+
+  // Each pass consumes two complex values (four floats) and emits two doubles per output.
+  for(idx = 0; idx < pairCount; idx++){
+    packed = _mm_load_ps(in);
+    in += 4;
+
+    // Shuffle to i1i2i1i2, then widen to double and store.
+    picked = _mm_shuffle_ps(packed, packed, _MM_SHUFFLE(2,0,2,0));
+    _mm_store_pd(iOut, _mm_cvtps_pd(picked));
+
+    // Shuffle to q1q2q1q2, then widen to double and store.
+    picked = _mm_shuffle_ps(packed, packed, _MM_SHUFFLE(3,1,3,1));
+    _mm_store_pd(qOut, _mm_cvtps_pd(picked));
+
+    iOut += 2;
+    qOut += 2;
+  }
+
+  // Scalar pass for an odd trailing value; the float is converted implicitly.
+  for(idx = pairCount * 2; idx < num_points; idx++){
+    *iOut++ = *in++;
+    *qOut++ = *in++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Splits an interleaved lv_32fc_t vector into separate double-precision I and Q vectors (portable C).
+  \param iBuffer Output buffer for the I values
+  \param qBuffer Output buffer for the Q values
+  \param complexVector The interleaved complex input vector
+  \param num_points Number of complex values to deinterleave
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  // The input is read as a flat float array: I first, then Q, for each value.
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    iBuffer[idx] = (double)in[0];
+    qBuffer[idx] = (double)in[1];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h
new file mode 100644
index 000000000..c88809beb
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h
@@ -0,0 +1,68 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_imag_32f_a_H
+#define INCLUDED_volk_32fc_deinterleave_imag_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Extracts the Q component of every complex value (aligned SSE loads/stores).
+  \param qBuffer Output buffer for the Q values
+  \param complexVector The interleaved complex input vector
+  \param num_points Number of complex values to process
+*/
+static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 4;
+  const float* in = (const float*)complexVector;
+  float* qOut = qBuffer;
+  unsigned int idx;
+  __m128 lo, hi;
+
+  // Each pass consumes four complex values (eight floats).
+  for(idx = 0; idx < blockCount; idx++){
+    lo = _mm_load_ps(in);
+    hi = _mm_load_ps(in + 4);
+    in += 8;
+
+    // Odd lanes of both registers give q1q2q3q4.
+    _mm_store_ps(qOut, _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1)));
+    qOut += 4;
+  }
+
+  // Scalar pass: skip the I float, keep the Q float.
+  for(idx = blockCount * 4; idx < num_points; idx++){
+    *qOut++ = in[1];
+    in += 2;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Extracts the Q component of every complex value (portable C).
+  \param qBuffer Output buffer for the Q values
+  \param complexVector The interleaved complex input vector
+  \param num_points Number of complex values to process
+*/
+static inline void volk_32fc_deinterleave_imag_32f_a_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  // in[0] is the I float and is skipped; in[1] is the Q float.
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    qBuffer[idx] = in[1];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h
new file mode 100644
index 000000000..0d6c6b7af
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h
@@ -0,0 +1,68 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a_H
+#define INCLUDED_volk_32fc_deinterleave_real_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Extracts the I component of every complex value (aligned SSE loads/stores).
+  \param iBuffer Output buffer for the I values
+  \param complexVector The interleaved complex input vector
+  \param num_points Number of complex values to process
+*/
+static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 4;
+  const float* in = (const float*)complexVector;
+  float* iOut = iBuffer;
+  unsigned int idx;
+  __m128 lo, hi;
+
+  // Each pass consumes four complex values (eight floats).
+  for(idx = 0; idx < blockCount; idx++){
+    lo = _mm_load_ps(in);
+    hi = _mm_load_ps(in + 4);
+    in += 8;
+
+    // Even lanes of both registers give i1i2i3i4.
+    _mm_store_ps(iOut, _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0)));
+    iOut += 4;
+  }
+
+  // Scalar pass: keep the I float, skip the Q float.
+  for(idx = blockCount * 4; idx < num_points; idx++){
+    *iOut++ = in[0];
+    in += 2;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Extracts the I component of every complex value (portable C).
+  \param iBuffer Output buffer for the I values
+  \param complexVector The interleaved complex input vector
+  \param num_points Number of complex values to process
+*/
+static inline void volk_32fc_deinterleave_real_32f_a_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  // in[0] is the I float; the Q float at in[1] is skipped.
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    iBuffer[idx] = in[0];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h
new file mode 100644
index 000000000..1e346baca
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a_H
+#define INCLUDED_volk_32fc_deinterleave_real_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Extracts the I component of every complex value as double precision (aligned SSE2).
+  \param iBuffer Output buffer for the I values
+  \param complexVector The interleaved complex input vector
+  \param num_points Number of complex values to process
+*/
+static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const unsigned int pairCount = num_points / 2;
+  const float* in = (float*)complexVector;
+  double* iOut = iBuffer;
+  unsigned int idx;
+  __m128 packed, evens;
+
+  // Each pass consumes two complex values (four floats) and emits two doubles.
+  for(idx = 0; idx < pairCount; idx++){
+    packed = _mm_load_ps(in);
+    in += 4;
+
+    // Shuffle to i1i2i1i2, then widen to double and store.
+    evens = _mm_shuffle_ps(packed, packed, _MM_SHUFFLE(2,0,2,0));
+    _mm_store_pd(iOut, _mm_cvtps_pd(evens));
+    iOut += 2;
+  }
+
+  // Scalar pass for an odd trailing value.
+  for(idx = pairCount * 2; idx < num_points; idx++){
+    *iOut++ = (double)in[0];
+    in += 2;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Extracts the I component of every complex value as double precision (portable C).
+  \param iBuffer Output buffer for the I values
+  \param complexVector The interleaved complex input vector
+  \param num_points Number of complex values to process
+*/
+static inline void volk_32fc_deinterleave_real_64f_a_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  // in[0] is the I float; the Q float at in[1] is skipped.
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    iBuffer[idx] = (double)in[0];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a_H */
diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/include/volk/volk_32fc_index_max_16u_a.h
new file mode 100644
index 000000000..842a6a042
--- /dev/null
+++ b/volk/include/volk/volk_32fc_index_max_16u_a.h
@@ -0,0 +1,215 @@
+#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
+#define INCLUDED_volk_32fc_index_max_16u_a_H
+
+#include <volk/volk_common.h>
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+
+/*!
+  \brief Finds the index of the complex value with the largest squared magnitude (aligned SSE3).
+  \param target Output; target[0] receives the index of the maximum
+  \param src0 The complex input vector, read with aligned loads
+  \param num_bytes Size of the input in bytes; every 8 bytes is treated as one complex value
+*/
+static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
+  union bit128 holderf;
+  union bit128 holderi;
+  float sq_dist = 0.0;
+
+  union bit128 xmm5, xmm4;
+  __m128 xmm1, xmm2, xmm3;
+  __m128i xmm8, xmm9, xmm10, xmm11, xmm12;
+
+  xmm5.int_vec = _mm_setzero_si128();
+  xmm4.int_vec = _mm_setzero_si128();
+  holderf.int_vec = _mm_setzero_si128();
+  holderi.int_vec = _mm_setzero_si128();
+
+  int bound = num_bytes >> 5;            // full blocks of four complex values
+  int leftovers0 = (num_bytes >> 4) & 1; // one trailing pair of complex values
+  int leftovers1 = (num_bytes >> 3) & 1; // one trailing single complex value
+  int i = 0;
+
+  // xmm8 holds the candidate index of each lane: 0,1,2,3 from the low lane up
+  // (_mm_set_epi32 takes its arguments highest lane first).
+  // Only xmm9 may be zeroed here. Zeroing xmm8 as well would give all four
+  // lanes the same index, so the result would only identify the block start.
+  xmm8 = _mm_set_epi32(3, 2, 1, 0);
+  xmm9 = _mm_setzero_si128();        // index of the best value seen so far, per lane
+  xmm10 = _mm_set_epi32(4, 4, 4, 4); // index step per main-loop pass
+  xmm3 = _mm_setzero_ps();           // best squared magnitude seen so far, per lane
+
+  for(; i < bound; ++i) {
+    xmm1 = _mm_load_ps((float*)src0);
+    xmm2 = _mm_load_ps((float*)&src0[2]);
+
+    src0 += 4;
+
+    xmm1 = _mm_mul_ps(xmm1, xmm1);
+    xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+    // Adjacent lanes are re^2 and im^2, so hadd gives four squared magnitudes.
+    xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+    xmm3 = _mm_max_ps(xmm1, xmm3);
+
+    // Lanes where the new value lost keep the old index (xmm4 mask);
+    // lanes where it equals the running max take the new index (xmm5 mask).
+    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+    xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+    xmm8 = _mm_add_epi32(xmm8, xmm10);
+  }
+
+  for(i = 0; i < leftovers0; ++i) {
+    xmm2 = _mm_load_ps((float*)src0);
+
+    // Copy the two low index lanes into the high half to match the
+    // m0,m1,m0,m1 layout produced by hadd(xmm2, xmm2) below.
+    xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+    xmm8 = bit128_p(&xmm1)->int_vec;
+
+    xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+    src0 += 2;
+
+    xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+    xmm3 = _mm_max_ps(xmm1, xmm3);
+
+    xmm10 = _mm_set_epi32(2, 2, 2, 2);
+
+    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+    xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+    xmm8 = _mm_add_epi32(xmm8, xmm10);
+  }
+
+  for(i = 0; i < leftovers1; ++i) {
+    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+
+    xmm2 = _mm_load1_ps(&sq_dist);
+
+    // xmm1 keeps the previous maxima; _mm_max_ss updates the low lane only.
+    xmm1 = xmm3;
+
+    xmm3 = _mm_max_ss(xmm3, xmm2);
+
+    // Here the masks compare old against new maxima, so their roles are
+    // swapped relative to the loops above: xmm4 selects the new index.
+    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+    // Broadcast the low lane, which holds the index of this last value.
+    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
+    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+
+    xmm9 = _mm_add_epi32(xmm11, xmm12);
+  }
+
+  _mm_store_ps((float*)&(holderf.f), xmm3);
+  _mm_store_si128(&(holderi.int_vec), xmm9);
+
+  // Reduce the four per-lane maxima to a single winner.
+  target[0] = holderi.i[0];
+  sq_dist = holderf.f[0];
+  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Finds the index of the complex value with the largest squared magnitude (portable C).
+  \param target Output; target[0] receives the index of the maximum
+  \param src0 The complex input vector
+  \param num_bytes Size of the input in bytes; every 8 bytes is treated as one complex value
+*/
+static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
+  const unsigned int count = num_bytes >> 3;
+  float best = 0.0;
+  unsigned int bestIdx = 0;
+  unsigned int k;
+
+  for(k = 0; k < count; ++k) {
+    const float mag2 = lv_creal(src0[k]) * lv_creal(src0[k]) + lv_cimag(src0[k]) * lv_cimag(src0[k]);
+
+    // Strict comparison: on a tie the earliest index is kept.
+    if(mag2 > best) {
+      bestIdx = k;
+      best = mag2;
+    }
+  }
+
+  target[0] = bestIdx;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a.h b/volk/include/volk/volk_32fc_magnitude_32f_a.h
new file mode 100644
index 000000000..efb84a904
--- /dev/null
+++ b/volk/include/volk/volk_32fc_magnitude_32f_a.h
@@ -0,0 +1,132 @@
+#ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
+#define INCLUDED_volk_32fc_magnitude_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Computes the magnitude of every complex value (aligned SSE3 loads/stores).
+  \param magnitudeVector Output vector for the real magnitude values
+  \param complexVector The complex input vector
+  \param num_points Number of complex values to read from complexVector and write to magnitudeVector
+*/
+static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 4;
+  const float* in = (float*)complexVector;
+  float* out = magnitudeVector;
+  unsigned int idx;
+  __m128 lo, hi;
+
+  // Each pass consumes four complex values (eight floats).
+  for(idx = 0; idx < blockCount; idx++){
+    lo = _mm_load_ps(in);
+    hi = _mm_load_ps(in + 4);
+    in += 8;
+
+    lo = _mm_mul_ps(lo, lo); // square every float
+    hi = _mm_mul_ps(hi, hi);
+
+    // hadd sums the adjacent I^2 and Q^2 lanes; sqrt gives the magnitudes.
+    _mm_store_ps(out, _mm_sqrt_ps(_mm_hadd_ps(lo, hi)));
+    out += 4;
+  }
+
+  // Scalar pass over the values that did not fill a whole block.
+  for(idx = blockCount * 4; idx < num_points; idx++){
+    float re = *in++;
+    float im = *in++;
+    *out++ = sqrtf((re * re) + (im * im));
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes the magnitude of every complex value (aligned SSE loads/stores).
+  \param magnitudeVector Output vector for the real magnitude values
+  \param complexVector The complex input vector
+  \param num_points Number of complex values to read from complexVector and write to magnitudeVector
+*/
+static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 4;
+  const float* in = (float*)complexVector;
+  float* out = magnitudeVector;
+  unsigned int idx;
+  __m128 lo, hi, iVals, qVals;
+
+  // Each pass consumes four complex values (eight floats).
+  for(idx = 0; idx < blockCount; idx++){
+    lo = _mm_load_ps(in);
+    hi = _mm_load_ps(in + 4);
+    in += 8;
+
+    // Separate into i1i2i3i4 and q1q2q3q4.
+    iVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+    qVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+
+    iVals = _mm_mul_ps(iVals, iVals);
+    qVals = _mm_mul_ps(qVals, qVals);
+
+    // sqrt(I^2 + Q^2) for all four values at once.
+    _mm_store_ps(out, _mm_sqrt_ps(_mm_add_ps(iVals, qVals)));
+    out += 4;
+  }
+
+  // Scalar pass over the values that did not fill a whole block.
+  for(idx = blockCount * 4; idx < num_points; idx++){
+    float re = *in++;
+    float im = *in++;
+    *out++ = sqrtf((re * re) + (im * im));
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes the magnitude of every complex value (portable C).
+  \param magnitudeVector Output vector for the real magnitude values
+  \param complexVector The complex input vector
+  \param num_points Number of complex values to read from complexVector and write to magnitudeVector
+*/
+static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  // The input is read as a flat float array: real first, then imaginary.
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    const float re = in[0];
+    const float im = in[1];
+    magnitudeVector[idx] = sqrtf((re*re) + (im*im));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+  \brief Computes the magnitude of every complex value by forwarding to the externally defined ORC implementation.
+  \param magnitudeVector Output vector for the real magnitude values
+  \param complexVector The complex input vector
+  \param num_points Number of complex values to read from complexVector and write to magnitudeVector
+*/
+extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector,
+                                               const lv_32fc_t* complexVector,
+                                               unsigned int num_points);
+
+static inline void volk_32fc_magnitude_32f_a_orc(float* magnitudeVector,
+                                                 const lv_32fc_t* complexVector,
+                                                 unsigned int num_points)
+{
+  // Arguments are passed through unchanged.
+  volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h
new file mode 100644
index 000000000..c8b3f0a08
--- /dev/null
+++ b/volk/include/volk/volk_32fc_magnitude_32f_u.h
@@ -0,0 +1,118 @@
+#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Computes the magnitude of every complex value (unaligned SSE3 loads/stores).
+  \param magnitudeVector Output vector for the real magnitude values
+  \param complexVector The complex input vector
+  \param num_points Number of complex values to read from complexVector and write to magnitudeVector
+*/
+static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 4;
+  const float* in = (float*)complexVector;
+  float* out = magnitudeVector;
+  unsigned int idx;
+  __m128 lo, hi;
+
+  // Each pass consumes four complex values (eight floats).
+  for(idx = 0; idx < blockCount; idx++){
+    lo = _mm_loadu_ps(in);
+    hi = _mm_loadu_ps(in + 4);
+    in += 8;
+
+    lo = _mm_mul_ps(lo, lo); // square every float
+    hi = _mm_mul_ps(hi, hi);
+
+    // hadd sums the adjacent I^2 and Q^2 lanes; sqrt gives the magnitudes.
+    _mm_storeu_ps(out, _mm_sqrt_ps(_mm_hadd_ps(lo, hi)));
+    out += 4;
+  }
+
+  // Scalar pass over the values that did not fill a whole block.
+  for(idx = blockCount * 4; idx < num_points; idx++){
+    float re = *in++;
+    float im = *in++;
+    *out++ = sqrtf((re * re) + (im * im));
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes the magnitude of every complex value (unaligned SSE loads/stores).
+  \param magnitudeVector Output vector for the real magnitude values
+  \param complexVector The complex input vector
+  \param num_points Number of complex values to read from complexVector and write to magnitudeVector
+*/
+static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 4;
+  const float* in = (float*)complexVector;
+  float* out = magnitudeVector;
+  unsigned int idx;
+  __m128 lo, hi, iVals, qVals;
+
+  // Each pass consumes four complex values (eight floats).
+  for(idx = 0; idx < blockCount; idx++){
+    lo = _mm_loadu_ps(in);
+    hi = _mm_loadu_ps(in + 4);
+    in += 8;
+
+    // Separate into i1i2i3i4 and q1q2q3q4.
+    iVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+    qVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+
+    iVals = _mm_mul_ps(iVals, iVals);
+    qVals = _mm_mul_ps(qVals, qVals);
+
+    // sqrt(I^2 + Q^2) for all four values at once.
+    _mm_storeu_ps(out, _mm_sqrt_ps(_mm_add_ps(iVals, qVals)));
+    out += 4;
+  }
+
+  // Scalar pass over the values that did not fill a whole block.
+  for(idx = blockCount * 4; idx < num_points; idx++){
+    float re = *in++;
+    float im = *in++;
+    *out++ = sqrtf((re * re) + (im * im));
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes the magnitude of every complex value (portable C).
+  \param magnitudeVector Output vector for the real magnitude values
+  \param complexVector The complex input vector
+  \param num_points Number of complex values to read from complexVector and write to magnitudeVector
+*/
+static inline void volk_32fc_magnitude_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  // The input is read as a flat float array: real first, then imaginary.
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    const float re = in[0];
+    const float im = in[1];
+    magnitudeVector[idx] = sqrtf((re*re) + (im*im));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h
new file mode 100644
index 000000000..d3ac9717a
--- /dev/null
+++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h
@@ -0,0 +1,114 @@
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Computes the squared magnitude of every complex value (aligned SSE3 loads/stores).
+  \param magnitudeVector Output vector for the real squared-magnitude values
+  \param complexVector The complex input vector
+  \param num_points Number of complex values to read from complexVector and write to magnitudeVector
+*/
+static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 4;
+  const float* in = (float*)complexVector;
+  float* out = magnitudeVector;
+  unsigned int idx;
+  __m128 lo, hi;
+
+  // Each pass consumes four complex values (eight floats).
+  for(idx = 0; idx < blockCount; idx++){
+    lo = _mm_load_ps(in);
+    hi = _mm_load_ps(in + 4);
+    in += 8;
+
+    lo = _mm_mul_ps(lo, lo); // square every float
+    hi = _mm_mul_ps(hi, hi);
+
+    // hadd sums the adjacent I^2 and Q^2 lanes.
+    _mm_store_ps(out, _mm_hadd_ps(lo, hi));
+    out += 4;
+  }
+
+  // Scalar pass over the values that did not fill a whole block.
+  for(idx = blockCount * 4; idx < num_points; idx++){
+    float re = *in++;
+    float im = *in++;
+    *out++ = (re * re) + (im * im);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes the squared magnitude of every complex value (aligned SSE loads/stores).
+  \param magnitudeVector Output vector for the real squared-magnitude values
+  \param complexVector The complex input vector
+  \param num_points Number of complex values to read from complexVector and write to magnitudeVector
+*/
+static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 4;
+  const float* in = (float*)complexVector;
+  float* out = magnitudeVector;
+  unsigned int idx;
+  __m128 lo, hi, iVals, qVals;
+
+  // Each pass consumes four complex values (eight floats).
+  for(idx = 0; idx < blockCount; idx++){
+    lo = _mm_load_ps(in);
+    hi = _mm_load_ps(in + 4);
+    in += 8;
+
+    // Separate into i1i2i3i4 and q1q2q3q4.
+    iVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+    qVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+
+    iVals = _mm_mul_ps(iVals, iVals);
+    qVals = _mm_mul_ps(qVals, qVals);
+
+    // I^2 + Q^2 for all four values at once.
+    _mm_store_ps(out, _mm_add_ps(iVals, qVals));
+    out += 4;
+  }
+
+  // Scalar pass over the values that did not fill a whole block.
+  for(idx = blockCount * 4; idx < num_points; idx++){
+    float re = *in++;
+    float im = *in++;
+    *out++ = (re * re) + (im * im);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes the squared magnitude of every complex value (portable C).
+  \param magnitudeVector Output vector for the real squared-magnitude values
+  \param complexVector The complex input vector
+  \param num_points Number of complex values to read from complexVector and write to magnitudeVector
+*/
+static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  // The input is read as a flat float array: real first, then imaginary.
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    const float re = in[0];
+    const float im = in[1];
+    magnitudeVector[idx] = (re*re) + (im*im);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h
new file mode 100644
index 000000000..53a4e68eb
--- /dev/null
+++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h
@@ -0,0 +1,114 @@
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+   \brief Computes the squared magnitude (re*re + im*im) of every complex sample using SSE3 with unaligned loads and stores
+   \param magnitudeVector Output buffer that receives num_points real values
+   \param complexVector Input buffer of num_points complex samples, read as pairs of floats (real first, then imaginary)
+   \param num_points Number of complex samples to process
+ */
+static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  float* dst = magnitudeVector;
+  const unsigned int vectorBlocks = num_points / 4;  /* four complex samples per SIMD iteration */
+  unsigned int remaining;
+
+  for(remaining = vectorBlocks; remaining != 0; remaining--){
+    /* Two unaligned loads fetch re0,im0,re1,im1 and re2,im2,re3,im3 */
+    __m128 lo = _mm_loadu_ps(src);
+    __m128 hi = _mm_loadu_ps(src + 4);
+    src += 8;
+
+    /* Square every lane; the horizontal add then sums each re^2/im^2 pair */
+    lo = _mm_mul_ps(lo, lo);
+    hi = _mm_mul_ps(hi, hi);
+
+    _mm_storeu_ps(dst, _mm_hadd_ps(lo, hi));
+    dst += 4;
+  }
+
+  /* Scalar tail for the last num_points % 4 samples */
+  for(remaining = num_points - (vectorBlocks * 4); remaining != 0; remaining--){
+    const float re = src[0];
+    const float im = src[1];
+    src += 2;
+    *dst++ = (re * re) + (im * im);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+   \brief Computes the squared magnitude (re*re + im*im) of every complex sample using SSE with unaligned loads and stores
+   \param magnitudeVector Output buffer that receives num_points real values
+   \param complexVector Input buffer of num_points complex samples, read as pairs of floats (real first, then imaginary)
+   \param num_points Number of complex samples to process
+ */
+static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  float* dst = magnitudeVector;
+  const unsigned int vectorBlocks = num_points / 4;  /* four complex samples per SIMD iteration */
+  unsigned int remaining;
+
+  for(remaining = vectorBlocks; remaining != 0; remaining--){
+    const __m128 lo = _mm_loadu_ps(src);
+    const __m128 hi = _mm_loadu_ps(src + 4);
+    /* Even lanes of both registers: the four real parts */
+    __m128 reVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+    /* Odd lanes of both registers: the four imaginary parts */
+    __m128 imVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+
+    reVals = _mm_mul_ps(reVals, reVals);
+    imVals = _mm_mul_ps(imVals, imVals);
+
+    _mm_storeu_ps(dst, _mm_add_ps(reVals, imVals));
+
+    src += 8;
+    dst += 4;
+  }
+
+  /* Scalar tail for the last num_points % 4 samples */
+  for(remaining = num_points - (vectorBlocks * 4); remaining != 0; remaining--){
+    const float re = src[0];
+    const float im = src[1];
+    src += 2;
+    *dst++ = (re * re) + (im * im);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Portable C computation of the squared magnitude (re*re + im*im) of every complex sample
+   \param magnitudeVector Output buffer that receives num_points real values
+   \param complexVector Input buffer of num_points complex samples, read as pairs of floats (real first, then imaginary)
+   \param num_points Number of complex samples to process
+ */
+static inline void volk_32fc_magnitude_squared_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  float* out = magnitudeVector;
+  float* const outEnd = magnitudeVector + num_points;
+
+  /* One output float is produced for each pair of input floats */
+  while(out != outEnd){
+    const float re = *src++;
+    const float im = *src++;
+    *out++ = (re * re) + (im * im);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_u_H */
diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h
new file mode 100644
index 000000000..d86bd63c1
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
+#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Computes atan2(Q, I) for every complex sample and scales the result by 1/normalizeFactor.
+  \param outputVector Output buffer for num_points phase values; written with aligned stores when the SIMD path is compiled in.
+  \param complexVector Input buffer of interleaved IQ data (I first, then Q); read with aligned loads when the SIMD path is compiled in.
+  \param normalizeFactor The atan2 results are divided by this normalization factor.
+  \param num_points The number of complex values in the input vector.
+
+  The vectorized loop is only built when LV_HAVE_LIB_SIMDMATH is defined;
+  otherwise every sample goes through the scalar atan2f() loop.
+*/
+static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  float* dst = outputVector;
+  const float scale = 1.0 / normalizeFactor;
+  unsigned int done = 0;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+  {
+    const unsigned int vectorBlocks = num_points / 4;
+    const __m128 twoPi = _mm_set_ps1(2*M_PI);
+    const __m128 pi = _mm_set_ps1(M_PI);
+    const __m128 scaleVec = _mm_set_ps1(scale);
+    unsigned int block;
+
+    for (block = 0; block < vectorBlocks; block++) {
+      // Load four IQ pairs and split them into an I register and a Q register
+      const __m128 lo = _mm_load_ps(src);
+      const __m128 hi = _mm_load_ps(src + 4);
+      const __m128 iVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+      const __m128 qVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+      __m128 phase = atan2f4(qVals, iVals);
+      // atan2f4 was observed to return 2*pi instead of pi when Q == 0 and I < 0:
+      // keep every lane that differs from 2*pi and substitute pi in the others.
+      const __m128 keep = _mm_cmpneq_ps(phase, twoPi);
+
+      phase = _mm_blendv_ps(pi, phase, keep);
+      _mm_store_ps(dst, _mm_mul_ps(phase, scaleVec));
+
+      src += 8;
+      dst += 4;
+    }
+    done = vectorBlocks * 4;
+  }
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  // Scalar loop: the remaining samples, or all of them without simdmath
+  for (; done < num_points; done++) {
+    const float iVal = src[0];
+    const float qVal = src[1];
+    src += 2;
+    *dst++ = atan2f(qVal, iVal) * scale;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Computes atan2(Q, I) for every complex sample and scales the result by 1/normalizeFactor.
+  \param outputVector Output buffer for num_points phase values; written with aligned stores when the SIMD path is compiled in.
+  \param complexVector Input buffer of interleaved IQ data (I first, then Q); read with aligned loads when the SIMD path is compiled in.
+  \param normalizeFactor The atan2 results are divided by this normalization factor.
+  \param num_points The number of complex values in the input vector.
+
+  The vectorized loop is only built when LV_HAVE_LIB_SIMDMATH is defined;
+  otherwise every sample goes through the scalar atan2f() loop.
+*/
+static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  float* dst = outputVector;
+  const float scale = 1.0 / normalizeFactor;
+  unsigned int done = 0;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+  {
+    const unsigned int vectorBlocks = num_points / 4;
+    const __m128 twoPi = _mm_set_ps1(2*M_PI);
+    const __m128 pi = _mm_set_ps1(M_PI);
+    const __m128 scaleVec = _mm_set_ps1(scale);
+    unsigned int block;
+
+    for (block = 0; block < vectorBlocks; block++) {
+      // Load four IQ pairs and split them into an I register and a Q register
+      const __m128 lo = _mm_load_ps(src);
+      const __m128 hi = _mm_load_ps(src + 4);
+      const __m128 iVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+      const __m128 qVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+      __m128 phase = atan2f4(qVals, iVals);
+      // atan2f4 was observed to return 2*pi instead of pi when Q == 0 and I < 0.
+      // Plain SSE has no blend instruction, so select with and / andnot / or:
+      // lanes that differ from 2*pi keep their phase, the others receive pi.
+      const __m128 keep = _mm_cmpneq_ps(phase, twoPi);
+
+      phase = _mm_or_ps(_mm_and_ps(phase, keep), _mm_andnot_ps(keep, pi));
+      _mm_store_ps(dst, _mm_mul_ps(phase, scaleVec));
+
+      src += 8;
+      dst += 4;
+    }
+    done = vectorBlocks * 4;
+  }
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  // Scalar loop: the remaining samples, or all of them without simdmath
+  for (; done < num_points; done++) {
+    const float iVal = src[0];
+    const float qVal = src[1];
+    src += 2;
+    *dst++ = atan2f(qVal, iVal) * scale;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Portable C version: computes atan2(Q, I) for every complex sample and scales the result by 1/normalizeFactor.
+  \param outputVector Output buffer that receives num_points phase values.
+  \param inputVector Input buffer of interleaved IQ data (I first, then Q).
+  \param normalizeFactor The atan2 results are divided by this normalization factor.
+  \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_s32f_atan2_32f_a_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
+  // Divide once up front so the loop only has to multiply
+  const float scale = 1.0 / normalizeFactor;
+  const float* iq = (float*)inputVector;
+  float* out = outputVector;
+  float* const outEnd = outputVector + num_points;
+
+  while (out != outEnd) {
+    const float iVal = *iq++;
+    const float qVal = *iq++;
+    *out++ = atan2f(qVal, iVal) * scale;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h
new file mode 100644
index 000000000..1c17fb70c
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
+#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Extracts the real (I) part of each complex sample, multiplies it by a scalar and converts it to int16_t
+  \param iBuffer Output buffer that receives num_points int16_t values
+  \param complexVector Complex input vector, read with aligned loads as interleaved floats (I first, then Q)
+  \param scalar Value each real part is multiplied by before the conversion
+  \param num_points The number of complex data values to process
+*/
+static inline void volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  int16_t* dst = iBuffer;
+  const unsigned int vectorBlocks = num_points / 4;
+  const __m128 scaleVec = _mm_set_ps1(scalar);
+  unsigned int count;
+
+  // Staging area: the scaled floats are spilled here and converted lane by lane in C
+  __VOLK_ATTR_ALIGNED(16) float scaled[4];
+
+  for(count = 0; count < vectorBlocks; count++){
+    const __m128 lo = _mm_load_ps(src);
+    const __m128 hi = _mm_load_ps(src + 4);
+    // Even lanes of both registers: i1 i2 i3 i4
+    const __m128 iVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+    unsigned int lane;
+
+    _mm_store_ps(scaled, _mm_mul_ps(iVals, scaleVec));
+    for(lane = 0; lane < 4; lane++){
+      *dst++ = (int16_t)(scaled[lane]);
+    }
+
+    src += 8;
+  }
+
+  // Scalar tail: convert the real part, step over the imaginary part
+  for(count = vectorBlocks * 4; count < num_points; count++){
+    *dst++ = (int16_t)(src[0] * scalar);
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Portable C version: extracts the real (I) part of each complex sample, multiplies it by a scalar and converts it to int16_t
+  \param iBuffer Output buffer that receives num_points int16_t values
+  \param complexVector Complex input vector, read as interleaved floats (I first, then Q)
+  \param scalar Value each real part is multiplied by before the conversion
+  \param num_points The number of complex data values to process
+*/
+static inline void volk_32fc_s32f_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  int16_t* out = iBuffer;
+  int16_t* const outEnd = iBuffer + num_points;
+
+  while(out != outEnd){
+    // Only the first float of each pair (the real part) is used
+    *out++ = (int16_t)(src[0] * scalar);
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H */
diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h
new file mode 100644
index 000000000..38fd609d3
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h
@@ -0,0 +1,159 @@
+#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
+#define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Computes sqrt(re*re + im*im) * scalar for every complex sample and converts the result to int16_t (SSE3 version)
+  \param magnitudeVector Output buffer that receives num_points int16_t values
+  \param complexVector Complex input vector, read with aligned loads as interleaved floats (real first, then imaginary)
+  \param scalar Scale value multiplied with each magnitude before the conversion
+  \param num_points The number of complex values to process
+*/
+static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* src = (const float*)complexVector;
+  int16_t* dst = magnitudeVector;
+  const unsigned int vectorBlocks = num_points / 4;
+  const __m128 scaleVec = _mm_set_ps1(scalar);
+  unsigned int count;
+
+  // Staging area: the scaled magnitudes are spilled here and converted lane by lane in C
+  __VOLK_ATTR_ALIGNED(16) float scaled[4];
+
+  for(count = 0; count < vectorBlocks; count++){
+    __m128 lo = _mm_load_ps(src);
+    __m128 hi = _mm_load_ps(src + 4);
+    __m128 mag;
+    unsigned int lane;
+
+    // Square every lane, then the horizontal add sums each re^2/im^2 pair
+    lo = _mm_mul_ps(lo, lo);
+    hi = _mm_mul_ps(hi, hi);
+    mag = _mm_sqrt_ps(_mm_hadd_ps(lo, hi));
+
+    _mm_store_ps(scaled, _mm_mul_ps(mag, scaleVec));
+    for(lane = 0; lane < 4; lane++){
+      *dst++ = (int16_t)(scaled[lane]);
+    }
+
+    src += 8;
+  }
+
+  // Scalar tail for the last num_points % 4 samples
+  for(count = vectorBlocks * 4; count < num_points; count++){
+    const float re = src[0];
+    const float im = src[1];
+    src += 2;
+    *dst++ = (int16_t)(sqrtf((re * re) + (im * im)) * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes sqrt(re*re + im*im) * scalar for every complex sample and converts the result to int16_t (SSE version)
+  \param magnitudeVector Output buffer that receives num_points int16_t values
+  \param complexVector Complex input vector, read with aligned loads as interleaved floats (real first, then imaginary)
+  \param scalar Scale value multiplied with each magnitude before the conversion
+  \param num_points The number of complex values to process
+*/
+static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* src = (const float*)complexVector;
+  int16_t* dst = magnitudeVector;
+  const unsigned int vectorBlocks = num_points / 4;
+  const __m128 scaleVec = _mm_set_ps1(scalar);
+  unsigned int count;
+
+  // Staging area: the scaled magnitudes are spilled here and converted lane by lane in C
+  __VOLK_ATTR_ALIGNED(16) float scaled[4];
+
+  for(count = 0; count < vectorBlocks; count++){
+    const __m128 lo = _mm_load_ps(src);
+    const __m128 hi = _mm_load_ps(src + 4);
+    // Even lanes: i1 i2 i3 i4; odd lanes: q1 q2 q3 q4
+    __m128 iVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+    __m128 qVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+    __m128 mag;
+    unsigned int lane;
+
+    iVals = _mm_mul_ps(iVals, iVals);
+    qVals = _mm_mul_ps(qVals, qVals);
+    mag = _mm_sqrt_ps(_mm_add_ps(iVals, qVals));
+
+    _mm_store_ps(scaled, _mm_mul_ps(mag, scaleVec));
+    for(lane = 0; lane < 4; lane++){
+      *dst++ = (int16_t)(scaled[lane]);
+    }
+
+    src += 8;
+  }
+
+  // Scalar tail for the last num_points % 4 samples
+  for(count = vectorBlocks * 4; count < num_points; count++){
+    const float re = src[0];
+    const float im = src[1];
+    src += 2;
+    *dst++ = (int16_t)(sqrtf((re * re) + (im * im)) * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Portable C version: computes sqrt(re*re + im*im) * scalar for every complex sample and converts the result to int16_t
+  \param magnitudeVector Output buffer that receives num_points int16_t values
+  \param complexVector Complex input vector, read as interleaved floats (real first, then imaginary)
+  \param scalar Scale value multiplied with each magnitude before the conversion
+  \param num_points The number of complex values to process
+*/
+static inline void volk_32fc_s32f_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  int16_t* out = magnitudeVector;
+  int16_t* const outEnd = magnitudeVector + num_points;
+
+  while(out != outEnd){
+    const float re = *src++;
+    const float im = *src++;
+    *out++ = (int16_t)(sqrtf((re * re) + (im * im)) * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+  \brief ORC entry point for the scaled-magnitude kernel; forwards all arguments unchanged to volk_32fc_s32f_magnitude_16i_a_orc_impl
+  \param magnitudeVector The vector that receives the int16_t output values
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied with the magnitude of each complex value
+  \param num_points The number of complex values in complexVector to be processed
+
+  The implementation is only declared here (extern); it is presumably provided
+  by the ORC-generated sources -- confirm against the build.
+*/
+extern void volk_32fc_s32f_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_32fc_s32f_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+ volk_32fc_s32f_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */
diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h
new file mode 100644
index 000000000..3106edbef
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h
@@ -0,0 +1,111 @@
+#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a_H
+#define INCLUDED_volk_32fc_s32f_power_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+//! Raise a complex float to a real float power via the polar form:
+//! z^p = |z|^p * (cos(p*theta) + i*sin(p*theta)), with theta = atan2(imag, real).
+//! \param exp The complex base value
+//! \param power The real exponent
+//! \return exp raised to power, as a complex float
+static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, const float power){
+  // atan2f takes (y, x), i.e. (imaginary, real) -- the same order the SSE
+  // kernel in this file passes to atan2f4(qValue, iValue).
+  const float arg = power*atan2f(lv_cimag(exp), lv_creal(exp));
+  // (re^2 + im^2)^(power/2) == |z|^power, which avoids a separate sqrt
+  const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2);
+  return mag*lv_cmake(cosf(arg), sinf(arg));
+}
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Raises each complex input value to a real power and stores the complex results
+  \param cVector Output buffer for the num_points complex results
+  \param aVector Complex input values to be raised to the power
+  \param power Real exponent applied to every input value
+  \param num_points Number of complex values to process
+
+  The vectorized loop is only built when LV_HAVE_LIB_SIMDMATH is defined;
+  otherwise every value goes through __volk_s32fc_s32f_power_s32fc_a().
+*/
+static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+  lv_32fc_t* dst = cVector;
+  const lv_32fc_t* src = aVector;
+  unsigned int done = 0;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+  {
+    const unsigned int vectorBlocks = num_points / 4;
+    const __m128 powerVec = _mm_set_ps1(power);
+
+    while(done < vectorBlocks){
+      // Load four complex values (two per register) and split them into I and Q lanes
+      const __m128 lo = _mm_load_ps((float*)src);
+      const __m128 hi = _mm_load_ps((float*)(src + 2));
+      const __m128 iVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+      const __m128 qVals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+
+      // Cartesian -> polar
+      __m128 phase = atan2f4(qVals, iVals);
+      __m128 magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iVals, iVals), _mm_mul_ps(qVals, qVals)));
+      __m128 outI, outQ;
+
+      // Apply the power in polar form: magnitude^power, phase*power
+      magnitude = powf4(magnitude, powerVec);
+      phase = _mm_mul_ps(phase, powerVec);
+
+      // Polar -> cartesian
+      outI = _mm_mul_ps(cosf4(phase), magnitude);
+      outQ = _mm_mul_ps(sinf4(phase), magnitude);
+
+      // Re-interleave as i,q pairs: lower two results first, then the upper two
+      _mm_store_ps((float*)dst, _mm_unpacklo_ps(outI, outQ));
+      _mm_store_ps((float*)(dst + 2), _mm_unpackhi_ps(outI, outQ));
+
+      src += 4;
+      dst += 4;
+      done++;
+    }
+
+    done = vectorBlocks * 4;
+  }
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  // Scalar loop: the remaining values, or all of them without simdmath
+  for(; done < num_points; done++){
+    *dst++ = __volk_s32fc_s32f_power_s32fc_a((*src++), power);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Portable C version: raises each complex input value to a real power and stores the complex results
+   \param cVector Output buffer for the num_points complex results
+   \param aVector Complex input values to be raised to the power
+   \param power Real exponent applied to every input value
+   \param num_points Number of complex values to process
+ */
+static inline void volk_32fc_s32f_power_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+  unsigned int idx;
+
+  // Each element is handled independently by the scalar helper
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = __volk_s32fc_s32f_power_s32fc_a(aVector[idx], power);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h
new file mode 100644
index 000000000..30a77dbc1
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h
@@ -0,0 +1,126 @@
+#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
+#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Calculates the log10 power value for each input point
+  \param logPowerOutput The 10.0 * log10(r*r + i*i + 1e-20) for each data point, with r and i taken after normalization
+  \param complexFFTInput The complex data output from the FFT, read with aligned loads in the SIMD path
+  \param normalizationFactor Every input value is divided by this factor before the power is calculated
+  \param num_points The number of fft data points
+
+  The vectorized loop is only built when LV_HAVE_LIB_SIMDMATH is defined;
+  otherwise every point goes through the scalar loop.
+*/
+static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
+  const float* inputPtr = (const float*)complexFFTInput;
+  float* destPtr = logPowerOutput;
+  uint64_t number = 0;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+#ifdef LV_HAVE_LIB_SIMDMATH
+  // magScalar = 10 / ln(10): converts a natural log into 10*log10
+  __m128 magScalar = _mm_set_ps1(10.0);
+  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+  // Same floor the scalar loop adds, so a zero-power bin yields the same
+  // finite value whether it lands in the vector part or in the tail
+  const __m128 powerFloor = _mm_set_ps1(1e-20f);
+
+  __m128 power;
+  __m128 input1, input2;
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load the complex values
+    input1 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    input2 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+
+    // Apply the normalization factor
+    input1 = _mm_mul_ps(input1, invNormalizationFactor);
+    input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+    // Multiply each value by itself
+    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+    input1 = _mm_mul_ps(input1, input1);
+    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+    input2 = _mm_mul_ps(input2, input2);
+
+    // Horizontal add, to add (r*r) + (i*i) for each complex value
+    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+    power = _mm_hadd_ps(input1, input2);
+
+    // Keep the argument of the log strictly positive
+    power = _mm_add_ps(power, powerFloor);
+
+    // Calculate the natural log power
+    power = logf4(power);
+
+    // Convert to log10 and multiply by 10.0
+    power = _mm_mul_ps(power, magScalar);
+
+    // Store the floating point results
+    _mm_store_ps(destPtr, power);
+
+    destPtr += 4;
+  }
+
+  number = quarterPoints*4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Calculate the log power for any remaining points
+
+  for(; number < num_points; number++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
+
+    destPtr++;
+  }
+
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Portable C version: calculates the log10 power value for each input point
+  \param logPowerOutput The 10.0 * log10(r*r + i*i + 1e-20) for each data point, with r and i taken after normalization
+  \param complexFFTInput The complex data output from the FFT
+  \param normalizationFactor Every input value is divided by this factor before the power is calculated
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_power_spectrum_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
+  const float* src = (float*)complexFFTInput;
+  float* out = logPowerOutput;
+  float* const outEnd = logPowerOutput + num_points;
+  // Divide once up front so the loop only has to multiply
+  const float invNorm = 1.0 / normalizationFactor;
+
+  // dBm reference notes carried over from the original author:
+  //   50 ohm load: 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+  //   75 ohm load: 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+  while(out != outEnd){
+    const float re = src[0] * invNorm;
+    const float im = src[1] * invNorm;
+    src += 2;
+
+    // The 1e-20 term keeps the argument of the log strictly positive
+    *out++ = 10.0*log10f(((re * re) + (im * im)) + 1e-20);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h
new file mode 100644
index 000000000..27f755351
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h
@@ -0,0 +1,134 @@
+#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
+#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput The 10.0 * log10((r*r + i*i + 1e-20)/RBW) for each data point, with r and i taken after normalization
+  \param complexFFTInput The complex data output from the FFT, read with aligned loads in the SIMD path
+  \param normalizationFactor Every input value is divided by this factor before the power is calculated
+  \param rbw The resolution bandwidth of the fft spectrum
+  \param num_points The number of fft data points
+
+  The vectorized loop is only built when LV_HAVE_LIB_SIMDMATH is defined;
+  otherwise every point goes through the scalar loop.
+*/
+static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  const float* inputPtr = (const float*)complexFFTInput;
+  float* destPtr = logPowerOutput;
+  uint64_t number = 0;
+  const float iRBW = 1.0 / rbw;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+  // magScalar = 10 / ln(10): converts a natural log into 10*log10
+  __m128 magScalar = _mm_set_ps1(10.0);
+  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+  __m128 invRBW = _mm_set_ps1(iRBW);
+
+  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+  // Same floor the scalar loop adds, so a zero-power bin yields the same
+  // finite value whether it lands in the vector part or in the tail
+  const __m128 powerFloor = _mm_set_ps1(1e-20f);
+
+  __m128 power;
+  __m128 input1, input2;
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load the complex values
+    input1 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    input2 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+
+    // Apply the normalization factor
+    input1 = _mm_mul_ps(input1, invNormalizationFactor);
+    input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+    // Multiply each value by itself
+    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+    input1 = _mm_mul_ps(input1, input1);
+    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+    input2 = _mm_mul_ps(input2, input2);
+
+    // Horizontal add, to add (r*r) + (i*i) for each complex value
+    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+    power = _mm_hadd_ps(input1, input2);
+
+    // Keep the argument of the log strictly positive (added before the RBW
+    // scaling, in the same order as the scalar loop)
+    power = _mm_add_ps(power, powerFloor);
+
+    // Divide by the rbw
+    power = _mm_mul_ps(power, invRBW);
+
+    // Calculate the natural log power
+    power = logf4(power);
+
+    // Convert to log10 and multiply by 10.0
+    power = _mm_mul_ps(power, magScalar);
+
+    // Store the floating point results
+    _mm_store_ps(destPtr, power);
+
+    destPtr += 4;
+  }
+
+  number = quarterPoints*4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Calculate the log power for any remaining points
+  for(; number < num_points; number++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+    destPtr++;
+  }
+
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Portable C version: calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput The 10.0 * log10((r*r + i*i + 1e-20)/RBW) for each data point, with r and i taken after normalization
+  \param complexFFTInput The complex data output from the FFT
+  \param normalizationFactor Every input value is divided by this factor before the power is calculated
+  \param rbw The resolution bandwidth of the fft spectrum
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  const float* src = (float*)complexFFTInput;
+  float* out = logPowerOutput;
+  float* const outEnd = logPowerOutput + num_points;
+  // Both divisions are done once up front so the loop only has to multiply
+  const float invBandwidth = 1.0 / rbw;
+  const float invNorm = 1.0 / normalizationFactor;
+
+  // dBm reference notes carried over from the original author:
+  //   50 ohm load: 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+  //   75 ohm load: 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+  while(out != outEnd){
+    const float re = src[0] * invNorm;
+    const float im = src[1] * invNorm;
+    src += 2;
+
+    // The 1e-20 term keeps the argument of the log strictly positive
+    *out++ = 10.0*log10f((((re * re) + (im * im)) + 1e-20) * invBandwidth);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
new file mode 100644
index 000000000..f206c5e87
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
@@ -0,0 +1,91 @@
+#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+   \brief Multiplies every element of the input complex vector by one complex scalar and stores the results in cVector
+   \param cVector The vector where the results will be stored
+   \param aVector The vector to be multiplied
+   \param scalar The complex scalar that multiplies each element of aVector
+   \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+ */
+static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+  const unsigned int pairCount = num_points / 2;
+
+  // Real and imaginary parts of the scalar (sr, si), each broadcast to all four lanes
+  const __m128 scalarRe = _mm_set_ps1(lv_creal(scalar));
+  const __m128 scalarIm = _mm_set_ps1(lv_cimag(scalar));
+
+  lv_32fc_t* out = cVector;
+  const lv_32fc_t* in = aVector;
+  unsigned int pair;
+
+  // Two complex values per iteration; _mm_load_ps/_mm_store_ps need 16-byte aligned pointers
+  for(pair = 0; pair < pairCount; pair++, in += 2, out += 2){
+    const __m128 ab = _mm_load_ps((float*)in);               // ar, ai, br, bi
+    const __m128 reProducts = _mm_mul_ps(ab, scalarRe);      // ar*sr, ai*sr, br*sr, bi*sr
+    const __m128 abSwapped = _mm_shuffle_ps(ab, ab, 0xB1);   // ai, ar, bi, br
+    const __m128 imProducts = _mm_mul_ps(abSwapped, scalarIm); // ai*si, ar*si, bi*si, br*si
+
+    // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
+    _mm_store_ps((float*)out, _mm_addsub_ps(reProducts, imProducts));
+  }
+
+  // An odd num_points leaves one value for the scalar path
+  if((num_points & 1) != 0){
+    *out = (*in) * scalar;
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Multiplies every element of the input complex vector by one complex scalar and stores the results in cVector
+   \param cVector The vector where the results will be stored
+   \param aVector The vector to be multiplied
+   \param scalar The complex scalar that multiplies each element of aVector
+   \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+ */
+static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+  const lv_32fc_t* src = aVector;
+  lv_32fc_t* dst = cVector;
+  unsigned int remaining;
+
+  // Process the bulk of the data in groups of eight values
+  for(remaining = num_points; remaining >= 8; remaining -= 8){
+    unsigned int k;
+    for(k = 0; k < 8; k++){
+      dst[k] = src[k] * scalar;
+    }
+    src += 8;
+    dst += 8;
+  }
+
+  // Finish the fewer-than-eight values left over
+  for(; remaining > 0; remaining--){
+    *dst++ = *src++ * scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
new file mode 100644
index 000000000..5c7d15b02
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
@@ -0,0 +1,87 @@
+#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Multiplies the input vector by a complex scalar and stores the results in cVector (unaligned loads and stores)
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied
+  \param scalar The complex scalar to multiply aVector
+  \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+  const float* src = (float*)aVector;
+  float* dst = (float*)cVector;
+  unsigned int pairsLeft = num_points / 2;
+  __m128 vals, swapped, byRe, byIm;
+
+  // sr and si broadcast across the register
+  const __m128 sRe = _mm_set_ps1(lv_creal(scalar));
+  const __m128 sIm = _mm_set_ps1(lv_cimag(scalar));
+
+  // Each pass handles two complex values, i.e. four floats
+  while(pairsLeft > 0){
+    vals = _mm_loadu_ps(src);                      // ar, ai, br, bi
+    byRe = _mm_mul_ps(vals, sRe);                  // ar*sr, ai*sr, br*sr, bi*sr
+    swapped = _mm_shuffle_ps(vals, vals, 0xB1);    // ai, ar, bi, br
+    byIm = _mm_mul_ps(swapped, sIm);               // ai*si, ar*si, bi*si, br*si
+
+    // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
+    _mm_storeu_ps(dst, _mm_addsub_ps(byRe, byIm));
+
+    src += 4;
+    dst += 4;
+    pairsLeft--;
+  }
+
+  // One value remains when num_points is odd
+  if((num_points % 2) != 0) {
+    *((lv_32fc_t*)dst) = (*((const lv_32fc_t*)src)) * scalar;
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies the input vector by a complex scalar and stores the results in cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied
+  \param scalar The complex scalar to multiply aVector
+  \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+  const unsigned int blocksOfEight = num_points / 8;
+  const unsigned int leftover = num_points % 8;
+  const lv_32fc_t* in = aVector;
+  lv_32fc_t* out = cVector;
+  unsigned int block, k;
+
+  // Main part: eight values per outer iteration
+  for(block = 0; block < blocksOfEight; block++){
+    for(k = 0; k < 8; k++){
+      *out++ = (*in++) * scalar;
+    }
+  }
+
+  // Remainder: the last num_points % 8 values
+  for(k = 0; k < leftover; k++){
+    *out++ = (*in++) * scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */
diff --git a/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h
new file mode 100644
index 000000000..eee9f0064
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h
@@ -0,0 +1,74 @@
+#ifndef INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
+
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <volk/volk_32fc_s32fc_x2_rotator_32fc_a.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*!
+  \brief Runs the generic rotator kernel with a fixed initial phase of (0.3, 0.95393)
+  \param outVector The vector where the results will be stored
+  \param inVector Vector to be rotated
+  \param phase_inc rotational velocity
+  \param num_points The number of values in inVector to be rotated and stored into outVector
+*/
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+  // The rotator advances the phase through a pointer, so it needs a local it can write to
+  lv_32fc_t start_phase = lv_cmake(.3, 0.95393);
+  volk_32fc_s32fc_x2_rotator_32fc_a_generic(outVector, inVector, phase_inc, &start_phase, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+/*!
+  \brief Runs the SSE4.1 rotator kernel with a fixed initial phase of (0.3, 0.95393)
+  \param outVector The vector where the results will be stored
+  \param inVector Vector to be rotated
+  \param phase_inc rotational velocity
+  \param num_points The number of values in inVector to be rotated and stored into outVector
+*/
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+  // Writable phase for the kernel to update; its final value is discarded
+  lv_32fc_t start_phase = lv_cmake(.3, .95393);
+  volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc, &start_phase, num_points);
+}
+
+
+
+
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+/*!
+  \brief Runs the AVX rotator kernel with a fixed initial phase of (0.3, 0.95393)
+  \param outVector The vector where the results will be stored
+  \param inVector Vector to be rotated
+  \param phase_inc rotational velocity
+  \param num_points The number of values in inVector to be rotated and stored into outVector
+*/
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+  // Local phase handed to the kernel by address; the updated value is not returned to the caller
+  lv_32fc_t start_phase = lv_cmake(.3, .95393);
+  volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc, &start_phase, num_points);
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
new file mode 100644
index 000000000..51b6041ec
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
@@ -0,0 +1,257 @@
+#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
+
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define ROTATOR_RELOAD 512
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*!
+ \brief rotate input vector at fixed rate per sample from initial phase offset
+ \param outVector The vector where the results will be stored
+ \param inVector Vector to be rotated
+ \param phase_inc rotational velocity
+ \param phase initial phase offset
+ \param num_points The number of values in inVector to be rotated and stored into cVector
+*/
+
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+ unsigned int i = 0;
+ int j = 0;
+ for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
+ for(j = 0; j < ROTATOR_RELOAD; ++j) {
+ *outVector++ = *inVector++ * (*phase);
+ (*phase) *= phase_inc;
+ }
+ (*phase) /= abs((*phase));
+ }
+ for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
+ *outVector++ = *inVector++ * (*phase);
+ (*phase) *= phase_inc;
+ }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+/*!
+  \brief rotate input vector at fixed rate per sample from initial phase offset (SSE4.1, two complex values per step)
+  \param outVector The vector where the results will be stored (accessed with aligned stores)
+  \param inVector Vector to be rotated (accessed with aligned loads)
+  \param phase_inc rotational velocity
+  \param phase initial phase offset; updated in place to the phase following the last rotated sample
+  \param num_points The number of values in inVector to be rotated and stored into outVector
+*/
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+  lv_32fc_t* cPtr = outVector;
+  const lv_32fc_t* aPtr = inVector;
+  lv_32fc_t incr = 1;
+  lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
+
+  unsigned int i, j = 0;
+
+  // phase_Ptr[k] = phase * phase_inc^k; on exit incr = phase_inc^2, the step between vector iterations
+  for(i = 0; i < 2; ++i) {
+    phase_Ptr[i] *= incr;
+    incr *= (phase_inc);
+  }
+
+  __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+  phase_Val = _mm_loadu_ps((float*)phase_Ptr);
+  inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+
+  // The increment never changes, so its real/imag broadcasts are computed once
+  ylp = _mm_moveldup_ps(inc_Val);
+  yhp = _mm_movehdup_ps(inc_Val);
+
+  const unsigned int halfPoints = num_points / 2;
+
+
+  for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
+    for(j = 0; j < ROTATOR_RELOAD; ++j) {
+
+      aVal = _mm_load_ps((float*)aPtr);
+
+      yl = _mm_moveldup_ps(phase_Val);   // real parts of the two phasors, duplicated
+      yh = _mm_movehdup_ps(phase_Val);   // imaginary parts of the two phasors, duplicated
+
+      tmp1 = _mm_mul_ps(aVal, yl);
+      tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+      aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);              // swap real/imag within each value
+      phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+      tmp2 = _mm_mul_ps(aVal, yh);
+      tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+      z = _mm_addsub_ps(tmp1, tmp2);              // input * phase
+      phase_Val = _mm_addsub_ps(tmp1p, tmp2p);    // phase * incr
+
+      _mm_store_ps((float*)cPtr, z);
+
+      aPtr += 2;
+      cPtr += 2;
+    }
+    // Rescale both phasors to unit magnitude. The square root is required:
+    // dividing by the squared magnitude would not normalize the phasor and
+    // would disagree with the generic kernel.
+    tmp1 = _mm_mul_ps(phase_Val, phase_Val);      // re^2, im^2 of each phasor
+    tmp2 = _mm_hadd_ps(tmp1, tmp1);               // |p0|^2, |p1|^2, |p0|^2, |p1|^2
+    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);      // |p0|^2, |p0|^2, |p1|^2, |p1|^2
+    tmp2 = _mm_sqrt_ps(tmp1);                     // |p0|, |p0|, |p1|, |p1|
+    phase_Val = _mm_div_ps(phase_Val, tmp2);
+  }
+  // Remaining pairs that do not fill a whole ROTATOR_RELOAD block
+  for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
+    aVal = _mm_load_ps((float*)aPtr);
+
+    yl = _mm_moveldup_ps(phase_Val);
+    yh = _mm_movehdup_ps(phase_Val);
+
+    tmp1 = _mm_mul_ps(aVal, yl);
+
+    tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+    aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+    phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+    tmp2 = _mm_mul_ps(aVal, yh);
+    tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+    z = _mm_addsub_ps(tmp1, tmp2);
+    phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+    _mm_store_ps((float*)cPtr, z);
+
+    aPtr += 2;
+    cPtr += 2;
+  }
+
+  // phase_Ptr[0] now holds the phase for the next unprocessed sample
+  _mm_storeu_ps((float*)phase_Ptr, phase_Val);
+  for(i = 0; i < num_points%2; ++i) {
+    *cPtr++ = *aPtr++ * phase_Ptr[0];
+    phase_Ptr[0] *= (phase_inc);
+  }
+
+  (*phase) = phase_Ptr[0];
+
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+/*!
+  \brief rotate input vector at fixed rate per sample from initial phase offset (AVX, four complex values per step)
+  \param outVector The vector where the results will be stored (accessed with aligned stores)
+  \param inVector Vector to be rotated (accessed with aligned loads)
+  \param phase_inc rotational velocity
+  \param phase initial phase offset; updated in place to the phase following the last rotated sample
+  \param num_points The number of values in inVector to be rotated and stored into outVector
+*/
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+  lv_32fc_t* cPtr = outVector;
+  const lv_32fc_t* aPtr = inVector;
+  lv_32fc_t incr = 1;
+  lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+
+  unsigned int i, j = 0;
+
+  // phase_Ptr[k] = phase * phase_inc^k; on exit incr = phase_inc^4, the step between vector iterations
+  for(i = 0; i < 4; ++i) {
+    phase_Ptr[i] *= incr;
+    incr *= (phase_inc);
+  }
+
+  __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+  inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+
+  // The increment never changes, so its real/imag broadcasts are computed once
+  ylp = _mm256_moveldup_ps(inc_Val);
+  yhp = _mm256_movehdup_ps(inc_Val);
+
+  const unsigned int fourthPoints = num_points / 4;
+
+
+  for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
+    for(j = 0; j < ROTATOR_RELOAD; ++j) {
+
+      aVal = _mm256_load_ps((float*)aPtr);
+
+      yl = _mm256_moveldup_ps(phase_Val);   // real parts of the phasors, duplicated
+      yh = _mm256_movehdup_ps(phase_Val);   // imaginary parts of the phasors, duplicated
+
+      tmp1 = _mm256_mul_ps(aVal, yl);
+      tmp1p = _mm256_mul_ps(phase_Val, ylp);
+
+      aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);              // swap real/imag within each value
+      phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+      tmp2 = _mm256_mul_ps(aVal, yh);
+      tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+      z = _mm256_addsub_ps(tmp1, tmp2);              // input * phase
+      phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);    // phase * incr
+
+      _mm256_store_ps((float*)cPtr, z);
+
+      aPtr += 4;
+      cPtr += 4;
+    }
+    // Rescale all four phasors to unit magnitude. The square root is required:
+    // dividing by the squared magnitude would not normalize the phasor and
+    // would disagree with the generic kernel. hadd and shuffle work within
+    // each 128-bit half, so every phasor ends up paired with its own magnitude.
+    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);      // re^2, im^2 of each phasor
+    tmp2 = _mm256_hadd_ps(tmp1, tmp1);               // per half: |pa|^2, |pb|^2, |pa|^2, |pb|^2
+    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);      // per half: |pa|^2, |pa|^2, |pb|^2, |pb|^2
+    tmp2 = _mm256_sqrt_ps(tmp1);                     // magnitudes
+    phase_Val = _mm256_div_ps(phase_Val, tmp2);
+  }
+  // Remaining groups of four that do not fill a whole ROTATOR_RELOAD block
+  for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+    aVal = _mm256_load_ps((float*)aPtr);
+
+    yl = _mm256_moveldup_ps(phase_Val);
+    yh = _mm256_movehdup_ps(phase_Val);
+
+    tmp1 = _mm256_mul_ps(aVal, yl);
+
+    tmp1p = _mm256_mul_ps(phase_Val, ylp);
+
+    aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+    phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+    tmp2 = _mm256_mul_ps(aVal, yh);
+    tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+    z = _mm256_addsub_ps(tmp1, tmp2);
+    phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
+
+    _mm256_store_ps((float*)cPtr, z);
+
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  // phase_Ptr[0] now holds the phase for the next unprocessed sample
+  _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+  for(i = 0; i < num_points%4; ++i) {
+    *cPtr++ = *aPtr++ * phase_Ptr[0];
+    phase_Ptr[0] *= (phase_inc);
+  }
+
+  (*phase) = phase_Ptr[0];
+
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h
new file mode 100644
index 000000000..e3dedf2fc
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h
@@ -0,0 +1,345 @@
+#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<volk/volk_complex.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Computes the dot product of input with the complex conjugate of taps
+  \param result Where the single complex sum of input[i] * conj(taps[i]) is stored
+  \param input The first complex vector
+  \param taps The complex vector that is conjugated before multiplying
+  \param num_bytes Length of each vector in bytes (8 bytes per complex value)
+*/
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  // Treat the complex vectors as interleaved (real, imag) floats
+  const float* in = (const float*) input;
+  const float* tp = (const float*) taps;
+  float* res = (float*) result;
+
+  const unsigned int num_points = num_bytes >> 3;
+  unsigned int pairsLeft = num_bytes >> 4;
+
+  // Separate accumulators for the first and second value of every pair
+  float firstRe = 0, firstIm = 0;
+  float secondRe = 0, secondIm = 0;
+
+  for(; pairsLeft != 0; --pairsLeft, in += 4, tp += 4) {
+    // (a + jb) * conj(c + jd) = (ac + bd) + j(bc - ad)
+    firstRe += in[0] * tp[0] + in[1] * tp[1];
+    firstIm += (-in[0] * tp[1]) + in[1] * tp[0];
+    secondRe += in[2] * tp[2] + in[3] * tp[3];
+    secondIm += (-in[2] * tp[3]) + in[3] * tp[2];
+  }
+
+  res[0] = firstRe + secondRe;
+  res[1] = firstIm + secondIm;
+
+  // An odd number of values leaves the last one outside the paired loop
+  if((num_points & 1) != 0) {
+    *result += input[num_points - 1] * lv_conj(taps[num_points - 1]);
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+/*!
+  \brief Computes the dot product of input with the complex conjugate of taps (x86-64 SSE inline assembly)
+  \param result Where the single complex sum of input[i] * conj(taps[i]) is stored
+  \param input The first complex vector (read with aligned loads)
+  \param taps The complex vector that is conjugated before multiplying
+  \param num_bytes Length of each vector in bytes (8 bytes per complex value)
+
+  The assembly handles the data in 16-byte blocks (two complex values); a trailing
+  single value is added by the C loop after the asm statement.
+*/
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+
+  // XOR mask that flips the sign bit of the imaginary float of each complex value
+  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+
+
+
+
+  asm volatile
+    (
+     "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
+     "# const float *taps, unsigned num_bytes)\n\t"
+     "# float sum0 = 0;\n\t"
+     "# float sum1 = 0;\n\t"
+     "# float sum2 = 0;\n\t"
+     "# float sum3 = 0;\n\t"
+     "# do {\n\t"
+     "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+     "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+     "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+     "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+     "# input += 4;\n\t"
+     "# taps += 4; \n\t"
+     "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+     "# result[0] = sum0 + sum2;\n\t"
+     "# result[1] = sum1 + sum3;\n\t"
+     "# TODO: prefetch and better scheduling\n\t"
+     " xor %%r9, %%r9\n\t"
+     " xor %%r10, %%r10\n\t"
+     " movq %[conjugator], %%r9\n\t"
+     " movq %%rcx, %%rax\n\t"
+     " movaps 0(%%r9), %%xmm8\n\t"
+     " movq %%rcx, %%r8\n\t"
+     " movq %[rsi], %%r9\n\t"
+     " movq %[rdx], %%r10\n\t"
+     " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+     " movaps 0(%%r9), %%xmm0\n\t"
+     " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+     " movups 0(%%r10), %%xmm2\n\t"
+     " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+     " shr $4, %%r8\n\t"
+     " xorps %%xmm8, %%xmm2\n\t"
+     " jmp .%=L1_test\n\t"
+     " # 4 taps / loop\n\t"
+     " # something like ?? cycles / loop\n\t"
+     ".%=Loop1: \n\t"
+     "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "# movaps (%%r9), %%xmmA\n\t"
+     "# movaps (%%r10), %%xmmB\n\t"
+     "# movaps %%xmmA, %%xmmZ\n\t"
+     "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+     "# mulps %%xmmB, %%xmmA\n\t"
+     "# mulps %%xmmZ, %%xmmB\n\t"
+     "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "# xorps %%xmmPN, %%xmmA\n\t"
+     "# movaps %%xmmA, %%xmmZ\n\t"
+     "# unpcklps %%xmmB, %%xmmA\n\t"
+     "# unpckhps %%xmmB, %%xmmZ\n\t"
+     "# movaps %%xmmZ, %%xmmY\n\t"
+     "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+     "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+     "# addps %%xmmZ, %%xmmA\n\t"
+     "# addps %%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     " movaps 16(%%r9), %%xmm1\n\t"
+     " movaps %%xmm0, %%xmm4\n\t"
+     " mulps %%xmm2, %%xmm0\n\t"
+     " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+     " movaps 16(%%r10), %%xmm3\n\t"
+     " movaps %%xmm1, %%xmm5\n\t"
+     " xorps %%xmm8, %%xmm3\n\t"
+     " addps %%xmm0, %%xmm6\n\t"
+     " mulps %%xmm3, %%xmm1\n\t"
+     " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+     " addps %%xmm1, %%xmm6\n\t"
+     " mulps %%xmm4, %%xmm2\n\t"
+     " movaps 32(%%r9), %%xmm0\n\t"
+     " addps %%xmm2, %%xmm7\n\t"
+     " mulps %%xmm5, %%xmm3\n\t"
+     " add $32, %%r9\n\t"
+     " movaps 32(%%r10), %%xmm2\n\t"
+     " addps %%xmm3, %%xmm7\n\t"
+     " add $32, %%r10\n\t"
+     " xorps %%xmm8, %%xmm2\n\t"
+     ".%=L1_test:\n\t"
+     " dec %%rax\n\t"
+     " jge .%=Loop1\n\t"
+     " # We've handled the bulk of multiplies up to here.\n\t"
+     " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     " # If so, we've got 2 more taps to do.\n\t"
+     " and $1, %%r8\n\t"
+     " je .%=Leven\n\t"
+     " # The count was odd, do 2 more taps.\n\t"
+     " # Note that we've already got mm0/mm2 preloaded\n\t"
+     " # from the main loop.\n\t"
+     " movaps %%xmm0, %%xmm4\n\t"
+     " mulps %%xmm2, %%xmm0\n\t"
+     " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+     " addps %%xmm0, %%xmm6\n\t"
+     " mulps %%xmm4, %%xmm2\n\t"
+     " addps %%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     " # neg inversor\n\t"
+     " xorps %%xmm1, %%xmm1\n\t"
+     " mov $0x80000000, %%r9\n\t"
+     " movd %%r9, %%xmm1\n\t"
+     " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+     " # pfpnacc\n\t"
+     " xorps %%xmm1, %%xmm6\n\t"
+     " movaps %%xmm6, %%xmm2\n\t"
+     " unpcklps %%xmm7, %%xmm6\n\t"
+     " unpckhps %%xmm7, %%xmm2\n\t"
+     " movaps %%xmm2, %%xmm3\n\t"
+     " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+     " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+     " addps %%xmm2, %%xmm6\n\t"
+     " # xmm6 = r1 i2 r3 i4\n\t"
+     " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+     " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+     :
+     // The byte count is widened to 64 bits because the template reads all of %rcx
+     :[rsi] "r" (input), [rdx] "r" (taps), "c" ((uint64_t)num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
+     // Everything the template overwrites: scratch GPRs, xmm0-xmm8, flags,
+     // and memory (the final movlps writes *result)
+     :"rax", "r8", "r9", "r10",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8",
+      "memory", "cc"
+     );
+
+
+  // Bytes not covered by the 16-byte blocks above (one complex value when this is 8)
+  int getem = num_bytes % 16;
+
+
+  for(; getem > 0; getem -= 8) {
+
+
+    *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
+
+  }
+
+  return;
+}
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+/*!
+  \brief Computes the dot product of input with the complex conjugate of taps (32-bit x86 SSE inline assembly)
+  \param result Where the single complex sum of input[i] * conj(taps[i]) is stored; the assembly also uses this location as scratch storage while it runs
+  \param input The first complex vector (read with aligned loads)
+  \param taps The complex vector that is conjugated before multiplying (read with aligned loads)
+  \param num_bytes Length of each vector in bytes (8 bytes per complex value)
+
+  The assembly handles the data in 16-byte blocks (two complex values); a trailing
+  single value is added by the C loop after the asm statement.
+*/
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+
+  // XOR mask that flips the sign bit of the imaginary float of each complex value
+  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+
+  int bound = num_bytes >> 4;
+  int leftovers = num_bytes % 16;
+
+  // The assembly advances both pointers and destroys the byte count, so it is
+  // given private copies declared as read-write operands. The caller-visible
+  // input/taps/num_bytes stay intact for the C tail loop below.
+  const lv_32fc_t* in_ptr = input;
+  const lv_32fc_t* taps_ptr = taps;
+  unsigned int count = num_bytes;
+
+
+  asm volatile
+    (
+     " #pushl %%ebp\n\t"
+     " #movl %%esp, %%ebp\n\t"
+     " #movl 12(%%ebp), %%eax # input\n\t"
+     " #movl 16(%%ebp), %%edx # taps\n\t"
+     " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
+     " movaps 0(%[conjugator]), %%xmm1\n\t"
+     " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+     " movaps 0(%[eax]), %%xmm0\n\t"
+     " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+     " movaps 0(%[edx]), %%xmm2\n\t"
+     " movl %[ecx], (%[out])\n\t"
+     " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
+
+     " xorps %%xmm1, %%xmm2\n\t"
+     " jmp .%=L1_test\n\t"
+     " # 4 taps / loop\n\t"
+     " # something like ?? cycles / loop\n\t"
+     ".%=Loop1: \n\t"
+     "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "# movaps (%[eax]), %%xmmA\n\t"
+     "# movaps (%[edx]), %%xmmB\n\t"
+     "# movaps %%xmmA, %%xmmZ\n\t"
+     "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+     "# mulps %%xmmB, %%xmmA\n\t"
+     "# mulps %%xmmZ, %%xmmB\n\t"
+     "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "# xorps %%xmmPN, %%xmmA\n\t"
+     "# movaps %%xmmA, %%xmmZ\n\t"
+     "# unpcklps %%xmmB, %%xmmA\n\t"
+     "# unpckhps %%xmmB, %%xmmZ\n\t"
+     "# movaps %%xmmZ, %%xmmY\n\t"
+     "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+     "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+     "# addps %%xmmZ, %%xmmA\n\t"
+     "# addps %%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     " movaps 16(%[edx]), %%xmm3\n\t"
+     " movaps %%xmm0, %%xmm4\n\t"
+     " xorps %%xmm1, %%xmm3\n\t"
+     " mulps %%xmm2, %%xmm0\n\t"
+     " movaps 16(%[eax]), %%xmm1\n\t"
+     " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+     " movaps %%xmm1, %%xmm5\n\t"
+     " addps %%xmm0, %%xmm6\n\t"
+     " mulps %%xmm3, %%xmm1\n\t"
+     " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+     " addps %%xmm1, %%xmm6\n\t"
+     " movaps 0(%[conjugator]), %%xmm1\n\t"
+     " mulps %%xmm4, %%xmm2\n\t"
+     " movaps 32(%[eax]), %%xmm0\n\t"
+     " addps %%xmm2, %%xmm7\n\t"
+     " mulps %%xmm5, %%xmm3\n\t"
+     " addl $32, %[eax]\n\t"
+     " movaps 32(%[edx]), %%xmm2\n\t"
+     " addps %%xmm3, %%xmm7\n\t"
+     " xorps %%xmm1, %%xmm2\n\t"
+     " addl $32, %[edx]\n\t"
+     ".%=L1_test:\n\t"
+     " decl %[ecx]\n\t"
+     " jge .%=Loop1\n\t"
+     " # We've handled the bulk of multiplies up to here.\n\t"
+     " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     " # If so, we've got 2 more taps to do.\n\t"
+     " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
+     " shrl $4, %[ecx]\n\t"
+     " andl $1, %[ecx]\n\t"
+     " je .%=Leven\n\t"
+     " # The count was odd, do 2 more taps.\n\t"
+     " # Note that we've already got mm0/mm2 preloaded\n\t"
+     " # from the main loop.\n\t"
+     " movaps %%xmm0, %%xmm4\n\t"
+     " mulps %%xmm2, %%xmm0\n\t"
+     " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+     " addps %%xmm0, %%xmm6\n\t"
+     " mulps %%xmm4, %%xmm2\n\t"
+     " addps %%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     " # neg inversor\n\t"
+     " #movl 8(%%ebp), %[eax] \n\t"
+     " xorps %%xmm1, %%xmm1\n\t"
+     " movl $0x80000000, (%[out])\n\t"
+     " movss (%[out]), %%xmm1\n\t"
+     " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+     " # pfpnacc\n\t"
+     " xorps %%xmm1, %%xmm6\n\t"
+     " movaps %%xmm6, %%xmm2\n\t"
+     " unpcklps %%xmm7, %%xmm6\n\t"
+     " unpckhps %%xmm7, %%xmm2\n\t"
+     " movaps %%xmm2, %%xmm3\n\t"
+     " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+     " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+     " addps %%xmm2, %%xmm6\n\t"
+     " # xmm6 = r1 i2 r3 i4\n\t"
+     " #movl 8(%%ebp), %[eax] # @result\n\t"
+     " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+     " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t"
+     " #popl %%ebp\n\t"
+     // Registers the template modifies are read-write operands
+     : [eax] "+r" (in_ptr), [edx] "+r" (taps_ptr), [ecx] "+r" (count)
+     : [out] "r" (result), [conjugator] "r" (conjugator)
+     // The template writes *result, changes the flags and uses xmm0-xmm7
+     : "memory", "cc",
+       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+     );
+
+
+  // Add the single value that follows the last complete 16-byte block, if any
+  for(; leftovers > 0; leftovers -= 8) {
+
+
+    *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
+
+  }
+
+  return;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h
new file mode 100644
index 000000000..e7493413f
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h
@@ -0,0 +1,145 @@
+#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+
+
+#include<volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Computes the dot product of input with the complex conjugate of taps
+  \param result Where the single complex sum of input[i] * conj(taps[i]) is stored
+  \param input The first complex vector
+  \param taps The complex vector that is conjugated before multiplying
+  \param num_bytes Length of each vector in bytes (8 bytes per complex value)
+*/
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  const unsigned int pairCount = num_bytes >> 4;
+  const unsigned int lastIndex = (num_bytes >> 3) - 1;
+  const unsigned int hasUnpaired = (num_bytes >> 3) & 1;
+
+  // Interleaved (real, imag) float views of the three complex buffers
+  const float* x = (const float*) input;
+  const float* h = (const float*) taps;
+  float* out = (float*) result;
+
+  // accA sums the first value of each pair, accB the second; [0] real, [1] imaginary
+  float accA[2] = {0,0};
+  float accB[2] = {0,0};
+  unsigned int n;
+
+  for(n = 0; n < pairCount; ++n) {
+    // (a + jb) * conj(c + jd) = (ac + bd) + j(bc - ad)
+    accA[0] += x[0] * h[0] + x[1] * h[1];
+    accA[1] += (-x[0] * h[1]) + x[1] * h[0];
+    accB[0] += x[2] * h[2] + x[3] * h[3];
+    accB[1] += (-x[2] * h[3]) + x[3] * h[2];
+
+    x += 4;
+    h += 4;
+  }
+
+  out[0] = accA[0] + accB[0];
+  out[1] = accA[1] + accB[1];
+
+  // With an odd number of values the final one was not covered by the pair loop
+  if(hasUnpaired) {
+    *result += input[lastIndex] * lv_conj(taps[lastIndex]);
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#include <mmintrin.h>
+
+
+/*!
+  \brief Computes the dot product of input with the complex conjugate of taps (SSE3, unaligned loads)
+  \param result Where the single complex sum of input[i] * conj(taps[i]) is stored
+  \param input The first complex vector
+  \param taps The complex vector that is conjugated before multiplying
+  \param num_bytes Length of each vector in bytes (8 bytes per complex value)
+*/
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+
+  // Sign-bit mask for lanes 0 and 2; XORing it negates the in.re*taps.im terms
+  union NegMask {
+    uint32_t intRep[4];
+    __m128 vec;
+  } negMask;
+
+  unsigned int offset = 0;
+  float Rsum=0, Isum=0;
+  float Im,Re;
+
+  __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
+  const __m128 zv = _mm_setzero_ps();
+
+  negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
+  negMask.intRep[1] = negMask.intRep[3] = 0;
+
+  // main loop: two complex values (four floats) per iteration
+  while(num_bytes >= 4*sizeof(float)){
+
+    in1 = _mm_loadu_ps( (float*) (input+offset) );           // a0, b0, a1, b1
+    in2 = _mm_loadu_ps( (float*) (taps+offset) );            // c0, d0, c1, d1
+    Rv = _mm_mul_ps(in1, in2);                               // a*c, b*d products
+    fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));   // d0, c0, d1, c1
+    Iv = _mm_mul_ps(in1, fehg);                              // a*d, b*c products
+    Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);              // lane 0 = sum of all four products
+    Ivm = _mm_xor_ps( negMask.vec, Iv );                     // -a*d, b*c
+    Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
+    _mm_store_ss( &Im, Is );
+    _mm_store_ss( &Re, Rs );
+    num_bytes -= 4*sizeof(float);
+    offset += 2;
+    Rsum += Re;
+    Isum += Im;
+  }
+
+  // handle the last complex case ...
+  // Only the two floats of the remaining value are loaded, so nothing past the
+  // end of either buffer is read; the upper lanes are zero and add nothing to
+  // the sums. A trailing fragment shorter than one complex value is ignored,
+  // as in the generic kernel.
+  if(num_bytes >= 2*sizeof(float)){
+
+    in1 = _mm_loadl_pi( zv, (const __m64*) (input+offset) ); // a, b, 0, 0
+    in2 = _mm_loadl_pi( zv, (const __m64*) (taps+offset) );  // c, d, 0, 0
+    Rv = _mm_mul_ps(in1, in2);
+    fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
+    Iv = _mm_mul_ps(in1, fehg);
+    Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
+    Ivm = _mm_xor_ps( negMask.vec, Iv );
+    Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
+    _mm_store_ss( &Im, Is );
+    _mm_store_ss( &Re, Rs );
+    Rsum += Re;
+    Isum += Im;
+  }
+
+  result[0] = lv_cmake(Rsum,Isum);
+  return;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
+
+
+
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
new file mode 100644
index 000000000..caef3e6f0
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
@@ -0,0 +1,440 @@
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+    /*
+     * Portable complex dot product: *result = sum over k of input[k] * taps[k].
+     * num_bytes is a byte length; the code treats every 8 bytes as one
+     * complex sample (two consecutive floats: real, imaginary).
+     */
+    float* out_f = (float*) result;
+    float* x = (float*) input;
+    float* h = (float*) taps;
+    const unsigned int n_points = num_bytes >> 3;
+    const unsigned int n_pairs = num_bytes >> 4;
+    const unsigned int has_tail = n_points & 1;
+
+    /* Two independent {re, im} accumulators: one per sample of each pair. */
+    float even_acc[2] = {0,0};
+    float odd_acc[2] = {0,0};
+    unsigned int k;
+
+    for(k = 0; k < n_pairs; ++k, x += 4, h += 4) {
+        even_acc[0] += x[0] * h[0] - x[1] * h[1];
+        even_acc[1] += x[0] * h[1] + x[1] * h[0];
+        odd_acc[0] += x[2] * h[2] - x[3] * h[3];
+        odd_acc[1] += x[2] * h[3] + x[3] * h[2];
+    }
+
+    out_f[0] = even_acc[0] + odd_acc[0];
+    out_f[1] = even_acc[1] + odd_acc[1];
+
+    /* An odd sample count leaves one trailing sample not covered by the pairs. */
+    if(has_tail) {
+        *result += input[n_points - 1] * taps[n_points - 1];
+    }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+
+ /*
+  * Complex dot product (*result = sum of input[k] * taps[k]) written in
+  * x86-64 SSE assembly. num_bytes is the byte length of each vector.
+  *
+  * Register use inside the asm:
+  *   r9 / r10  - running pointers into input / taps
+  *   rax       - num_bytes >> 5: main-loop trip count, 32 bytes per vector per trip
+  *   r8        - num_bytes >> 4: its low bit selects one extra 16-byte step
+  *   xmm6      - running sum of input * taps (element-wise)
+  *   xmm7      - running sum of swapped(input) * taps
+  * After the loop the two accumulators are combined (sign flip, unpack,
+  * add) and the low 8 bytes are stored to *result with movlps.
+  *
+  * movaps is used for every load, so both vectors must be 16-byte aligned.
+  *
+  * The clobber list names every register the asm overwrites (xmm0-xmm7 in
+  * addition to the integer scratch registers), the flags ("cc"), and
+  * "memory" because the asm stores to *result, which the C tail below
+  * reads back.
+  *
+  * NOTE(review): the loads ahead of .L1_test and at the bottom of the loop
+  * fetch the next 16 bytes of each vector before it is known that they are
+  * needed, so up to 16 bytes past the processed data are read - confirm
+  * callers' buffers tolerate this.
+  * NOTE(review): num_bytes is a 32-bit value bound to "c" but read as
+  * %rcx; this presumably relies on the upper half being zero - confirm.
+  */
+ asm
+ (
+ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%%r10), %%xmm2\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%r9), %%xmmA\n\t"
+ "# movaps (%%r10), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%r9), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%r10), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%r9), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " movaps 32(%%r10), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+ :
+ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+ :"rax", "r8", "r9", "r10",
+  "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+  "cc", "memory"
+ );
+
+
+ /* The asm consumes samples two at a time; an odd sample count leaves one
+    final sample, which is accumulated here in C. */
+ if(((num_bytes >> 3) & 1)) {
+ *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
+ }
+
+ return;
+
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+
+/*
+ * 32-bit SSE entry point for the complex dot product.
+ * The only code that is compiled here is the forwarding call to
+ * volk_32fc_x2_dot_prod_32fc_a_generic(); everything after it sits inside
+ * "#if 0" and is removed by the preprocessor.
+ */
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+
+ /* Active implementation: delegate to the portable C kernel. */
+ volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_bytes);
+
+ /*
+  * Disabled hand-written 32-bit assembly.
+  * It fetches its arguments from fixed offsets off %ebp (8/12/16/20) and
+  * has no asm operands.
+  * NOTE(review): presumably disabled because those stack offsets are not
+  * reliable for a static inline function - confirm before re-enabling.
+  */
+#if 0
+ asm volatile
+ (
+ " #pushl %%ebp\n\t"
+ " #movl %%esp, %%ebp\n\t"
+ " movl 12(%%ebp), %%eax # input\n\t"
+ " movl 16(%%ebp), %%edx # taps\n\t"
+ " movl 20(%%ebp), %%ecx # n_bytes\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%eax), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%%edx), %%xmm2\n\t"
+ " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%eax), %%xmmA\n\t"
+ "# movaps (%%edx), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%eax), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%edx), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%eax), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " addl $32, %%eax\n\t"
+ " movaps 32(%%edx), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " addl $32, %%edx\n\t"
+ ".%=L1_test:\n\t"
+ " decl %%ecx\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
+ " shrl $4, %%ecx\n\t"
+ " andl $1, %%ecx\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " movl 8(%%ebp), %%eax \n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " movl $0x80000000, (%%eax)\n\t"
+ " movss (%%eax), %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " #movl 8(%%ebp), %%eax # @result\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
+ " #popl %%ebp\n\t"
+ :
+ :
+ : "eax", "ecx", "edx"
+ );
+
+
+ /* (disabled) number of bytes left over after whole 16-byte groups */
+ int getem = num_bytes % 16;
+
+ /* (disabled) add the final sample once for each remaining 8-byte step */
+ for(; getem > 0; getem -= 8) {
+
+
+ *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
+
+ }
+
+ return;
+#endif
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+    /*
+     * SSE3 complex dot product: *result = sum over k of input[k] * taps[k].
+     * num_bytes is a byte length (8 bytes per complex sample). Samples are
+     * consumed two at a time with aligned loads (_mm_load_ps), so both
+     * vectors must be 16-byte aligned.
+     */
+    const unsigned int pair_count = num_bytes >> 4;
+    const lv_32fc_t* in_ptr = input;
+    const lv_32fc_t* tap_ptr = taps;
+    unsigned int remaining;
+    __m128 in_vec, tap_vec, tap_re, tap_im, part_a, part_b, prod, acc;
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t lane_sums[2];
+    lv_32fc_t total;
+
+    memset(&total, 0x0, sizeof(total));
+    acc = _mm_setzero_ps();
+
+    for(remaining = pair_count; remaining != 0; --remaining) {
+        in_vec = _mm_load_ps((float*)in_ptr);    /* ar, ai, br, bi */
+        tap_vec = _mm_load_ps((float*)tap_ptr);  /* cr, ci, dr, di */
+
+        tap_re = _mm_moveldup_ps(tap_vec);       /* cr, cr, dr, dr */
+        tap_im = _mm_movehdup_ps(tap_vec);       /* ci, ci, di, di */
+
+        part_a = _mm_mul_ps(in_vec, tap_re);     /* ar*cr, ai*cr, br*dr, bi*dr */
+
+        in_vec = _mm_shuffle_ps(in_vec, in_vec, 0xB1);  /* ai, ar, bi, br */
+        part_b = _mm_mul_ps(in_vec, tap_im);     /* ai*ci, ar*ci, bi*di, br*di */
+
+        /* subtract in the real lanes, add in the imaginary lanes */
+        prod = _mm_addsub_ps(part_a, part_b);
+        acc = _mm_add_ps(acc, prod);
+
+        in_ptr += 2;
+        tap_ptr += 2;
+    }
+
+    /* Spill the two per-lane complex partial sums and fold them together. */
+    _mm_store_ps((float*)lane_sums, acc);
+    total += ( lane_sums[0] + lane_sums[1] );
+
+    /* Odd sample count: one sample remains after the paired loop. */
+    if(((num_bytes >> 3) & 1) != 0) {
+        total += (*in_ptr) * (*tap_ptr);
+    }
+
+    *result = total;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+    /*
+     * SSE4.1 complex dot product: *result = sum over k of input[k] * taps[k].
+     * num_bytes is a byte length (8 bytes per complex sample). The main loop
+     * handles four samples (32 bytes) per iteration using aligned loads, so
+     * both vectors must be 16-byte aligned; the 0..3 samples that remain are
+     * accumulated in scalar code at the end.
+     */
+    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+    float *p_input, *p_taps;
+    __m64 *p_result;
+
+    p_result = (__m64*)result;
+    p_input = (float*)input;
+    p_taps = (float*)taps;
+
+    /* Only bit 31 of the lowest 32-bit lane is set: XOR-ing with this mask
+       flips the sign of lane 0 and leaves the other lanes untouched. */
+    static const __m128i neg = {0x000000000000000080000000};
+
+    int i = 0;
+
+    int bound = (num_bytes >> 5);            /* number of 4-sample blocks   */
+    int leftovers = (num_bytes & 24) >> 3;   /* samples after those blocks  */
+
+    /* Zero the accumulators explicitly. Subtracting an uninitialized
+       register from itself reads indeterminate data and produces NaN when
+       that data happens to be NaN or infinity. */
+    real0 = _mm_setzero_ps();
+    real1 = _mm_setzero_ps();
+    im0 = _mm_setzero_ps();
+    im1 = _mm_setzero_ps();
+
+    for(; i < bound; ++i) {
+
+        xmm0 = _mm_load_ps(p_input);
+        xmm1 = _mm_load_ps(p_taps);
+
+        p_input += 4;
+        p_taps += 4;
+
+        xmm2 = _mm_load_ps(p_input);
+        xmm3 = _mm_load_ps(p_taps);
+
+        p_input += 4;
+        p_taps += 4;
+
+        /* De-interleave four complex samples into separate real and
+           imaginary vectors with two rounds of unpack. */
+        xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+        xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+        xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+        xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+
+        //imaginary vector from input
+        xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+        //real vector from input
+        xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+        //imaginary vector from taps
+        xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+        //real vector from taps
+        xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+
+        /* mask 0xf1: four-element dot product written to lane 0 */
+        xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);   /* sum(re_in * re_tap) */
+        xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);   /* sum(im_in * im_tap) */
+
+        /* mask 0xf2: four-element dot product written to lane 1 */
+        xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);   /* sum(re_in * im_tap) */
+        xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);   /* sum(im_in * re_tap) */
+
+        real0 = _mm_add_ps(xmm4, real0);
+        real1 = _mm_add_ps(xmm5, real1);
+        im0 = _mm_add_ps(xmm6, im0);
+        im1 = _mm_add_ps(xmm7, im1);
+
+    }
+
+    /* real part = sum(re*re) - sum(im*im): negate lane 0 of real1, then add */
+    real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+
+    im0 = _mm_add_ps(im0, im1);
+    real0 = _mm_add_ps(real0, real1);
+
+    /* lane 0 now holds the real sum, lane 1 the imaginary sum */
+    im0 = _mm_add_ps(im0, real0);
+
+    _mm_storel_pi(p_result, im0);
+
+    /* Scalar tail for the samples that did not fill a 4-sample block. */
+    for(i = bound * 4; i < (bound * 4) + leftovers; ++i) {
+
+        *result += input[i] * taps[i];
+    }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h
new file mode 100644
index 000000000..7c0dba7fd
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h
@@ -0,0 +1,116 @@
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+    /*
+     * Portable complex dot product: *result = sum over k of input[k] * taps[k],
+     * where num_points is the number of complex samples in each vector.
+     */
+    float* out_f = (float*) result;
+    float* x = (float*) input;
+    float* h = (float*) taps;
+    const unsigned int n_pairs = num_points/2;
+    const unsigned int has_tail = num_points & 1;
+
+    /* Two independent {re, im} accumulators: one per sample of each pair. */
+    float even_acc[2] = {0,0};
+    float odd_acc[2] = {0,0};
+    unsigned int k;
+
+    for(k = 0; k < n_pairs; ++k, x += 4, h += 4) {
+        even_acc[0] += x[0] * h[0] - x[1] * h[1];
+        even_acc[1] += x[0] * h[1] + x[1] * h[0];
+        odd_acc[0] += x[2] * h[2] - x[3] * h[3];
+        odd_acc[1] += x[2] * h[3] + x[3] * h[2];
+    }
+
+    out_f[0] = even_acc[0] + odd_acc[0];
+    out_f[1] = even_acc[1] + odd_acc[1];
+
+    /* An odd sample count leaves one trailing sample not covered by the pairs. */
+    if(has_tail) {
+        *result += input[num_points - 1] * taps[num_points - 1];
+    }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+    /*
+     * SSE3 complex dot product: *result = sum over k of input[k] * taps[k],
+     * where num_points is the number of complex samples. Uses unaligned
+     * loads (_mm_loadu_ps), so the vectors need no particular alignment.
+     */
+    lv_32fc_t dotProduct;
+    memset(&dotProduct, 0x0, 2*sizeof(float));
+
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points/2;
+
+    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+    const lv_32fc_t* a = input;
+    const lv_32fc_t* b = taps;
+
+    dotProdVal = _mm_setzero_ps();
+
+    for(;number < halfPoints; number++){
+
+        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+        tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+        x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+        tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+        z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+        dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+        a += 2;
+        b += 2;
+    }
+
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+
+    _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+    dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+
+    // The loop consumed samples in pairs; an odd count leaves one sample.
+    // The test must be modulo 2: "% 1" is always zero and would silently
+    // drop that final sample.
+    if(num_points % 2 != 0) {
+        dotProduct += (*a) * (*b);
+    }
+
+    *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h
new file mode 100644
index 000000000..f79ddb59b
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h
@@ -0,0 +1,93 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+   \brief Element-wise complex product, cVector[i] = aVector[i] * bVector[i], using SSE3 with aligned loads and stores
+   \param cVector Output buffer that receives the products
+   \param aVector First input vector
+   \param bVector Second input vector
+   \param num_points Number of complex values to multiply
+ */
+static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const unsigned int pair_count = num_points / 2;
+    lv_32fc_t* out = cVector;
+    const lv_32fc_t* lhs = aVector;
+    const lv_32fc_t* rhs = bVector;
+    __m128 lhs_vec, rhs_vec, rhs_re, rhs_im, part_a, part_b, prod;
+    unsigned int k;
+
+    /* Two complex values (one 128-bit register) per iteration. */
+    for(k = 0; k < pair_count; ++k, lhs += 2, rhs += 2, out += 2){
+        lhs_vec = _mm_load_ps((float*)lhs);   /* ar, ai, br, bi */
+        rhs_vec = _mm_load_ps((float*)rhs);   /* cr, ci, dr, di */
+
+        rhs_re = _mm_moveldup_ps(rhs_vec);    /* cr, cr, dr, dr */
+        rhs_im = _mm_movehdup_ps(rhs_vec);    /* ci, ci, di, di */
+
+        part_a = _mm_mul_ps(lhs_vec, rhs_re); /* ar*cr, ai*cr, br*dr, bi*dr */
+
+        lhs_vec = _mm_shuffle_ps(lhs_vec, lhs_vec, 0xB1); /* ai, ar, bi, br */
+        part_b = _mm_mul_ps(lhs_vec, rhs_im); /* ai*ci, ar*ci, bi*di, br*di */
+
+        /* subtract in the real lanes, add in the imaginary lanes */
+        prod = _mm_addsub_ps(part_a, part_b);
+
+        _mm_store_ps((float*)out, prod);
+    }
+
+    /* Odd count: finish the last value with the scalar complex multiply. */
+    if((num_points % 2) != 0) {
+        *out = (*lhs) * (*rhs);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Element-wise complex product, cVector[i] = aVector[i] * bVector[i], in portable C
+   \param cVector Output buffer that receives the products
+   \param aVector First input vector
+   \param bVector Second input vector
+   \param num_points Number of complex values to multiply
+ */
+static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    for(idx = 0; idx < num_points; ++idx){
+        cVector[idx] = aVector[idx] * bVector[idx];
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+   \brief Multiplies the two input complex vectors and stores their results in the third vector
+   \param cVector The vector where the results will be stored
+   \param aVector One of the vectors to be multiplied
+   \param bVector One of the vectors to be multiplied
+   \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+
+   This wrapper does no work of its own: it forwards all four arguments,
+   unchanged, to volk_32fc_x2_multiply_32fc_a_orc_impl, which is only
+   declared here and defined elsewhere.
+ */
+/* Implementation is provided by another translation unit (presumably generated from an ORC program - confirm). */
+extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
+static inline void volk_32fc_x2_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h
new file mode 100644
index 000000000..a998d6184
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+   \brief Element-wise complex product, cVector[i] = aVector[i] * bVector[i], using SSE3 with unaligned loads and stores
+   \param cVector Output buffer that receives the products
+   \param aVector First input vector
+   \param bVector Second input vector
+   \param num_points Number of complex values to multiply
+ */
+static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const unsigned int pair_count = num_points / 2;
+    lv_32fc_t* out = cVector;
+    const lv_32fc_t* lhs = aVector;
+    const lv_32fc_t* rhs = bVector;
+    __m128 lhs_vec, rhs_vec, rhs_re, rhs_im, part_a, part_b, prod;
+    unsigned int k;
+
+    /* Two complex values (one 128-bit register) per iteration. */
+    for(k = 0; k < pair_count; ++k, lhs += 2, rhs += 2, out += 2){
+        lhs_vec = _mm_loadu_ps((float*)lhs);  /* ar, ai, br, bi */
+        rhs_vec = _mm_loadu_ps((float*)rhs);  /* cr, ci, dr, di */
+
+        rhs_re = _mm_moveldup_ps(rhs_vec);    /* cr, cr, dr, dr */
+        rhs_im = _mm_movehdup_ps(rhs_vec);    /* ci, ci, di, di */
+
+        part_a = _mm_mul_ps(lhs_vec, rhs_re); /* ar*cr, ai*cr, br*dr, bi*dr */
+
+        lhs_vec = _mm_shuffle_ps(lhs_vec, lhs_vec, 0xB1); /* ai, ar, bi, br */
+        part_b = _mm_mul_ps(lhs_vec, rhs_im); /* ai*ci, ar*ci, bi*di, br*di */
+
+        /* subtract in the real lanes, add in the imaginary lanes */
+        prod = _mm_addsub_ps(part_a, part_b);
+
+        _mm_storeu_ps((float*)out, prod);
+    }
+
+    /* Odd count: finish the last value with the scalar complex multiply. */
+    if((num_points % 2) != 0) {
+        *out = (*lhs) * (*rhs);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Element-wise complex product, cVector[i] = aVector[i] * bVector[i], in portable C
+   \param cVector Output buffer that receives the products
+   \param aVector First input vector
+   \param bVector Second input vector
+   \param num_points Number of complex values to multiply
+ */
+static inline void volk_32fc_x2_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    for(idx = 0; idx < num_points; ++idx){
+        cVector[idx] = aVector[idx] * bVector[idx];
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
new file mode 100644
index 000000000..2755192e9
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+   \brief Element-wise product with a conjugated second operand, cVector[i] = aVector[i] * conj(bVector[i]), using SSE3 with aligned loads and stores
+   \param cVector Output buffer that receives the products
+   \param aVector First input vector
+   \param bVector Second input vector; each value is conjugated before the multiply
+   \param num_points Number of complex values to multiply
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const unsigned int pair_count = num_points / 2;
+    lv_32fc_t* out = cVector;
+    const lv_32fc_t* lhs = aVector;
+    const lv_32fc_t* rhs = bVector;
+    __m128 lhs_vec, rhs_vec, rhs_re, rhs_im, part_a, part_b, prod;
+    unsigned int k;
+
+    /* Sign-bit mask on the imaginary lanes: XOR with it negates them. */
+    const __m128 imag_sign_flip = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+    /* Two complex values (one 128-bit register) per iteration. */
+    for(k = 0; k < pair_count; ++k, lhs += 2, rhs += 2, out += 2){
+        lhs_vec = _mm_load_ps((float*)lhs);   /* ar, ai, br, bi */
+        rhs_vec = _mm_load_ps((float*)rhs);   /* cr, ci, dr, di */
+
+        rhs_vec = _mm_xor_ps(rhs_vec, imag_sign_flip); /* cr, -ci, dr, -di */
+
+        rhs_re = _mm_moveldup_ps(rhs_vec);    /* real parts duplicated */
+        rhs_im = _mm_movehdup_ps(rhs_vec);    /* (negated) imaginary parts duplicated */
+
+        part_a = _mm_mul_ps(lhs_vec, rhs_re);
+
+        lhs_vec = _mm_shuffle_ps(lhs_vec, lhs_vec, 0xB1); /* ai, ar, bi, br */
+        part_b = _mm_mul_ps(lhs_vec, rhs_im);
+
+        /* subtract in the real lanes, add in the imaginary lanes */
+        prod = _mm_addsub_ps(part_a, part_b);
+
+        _mm_store_ps((float*)out, prod);
+    }
+
+    /* Odd count: finish the last value with the scalar complex multiply. */
+    if((num_points % 2) != 0) {
+        *out = (*lhs) * lv_conj(*rhs);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Element-wise product with a conjugated second operand, cVector[i] = aVector[i] * conj(bVector[i]), in portable C
+   \param cVector Output buffer that receives the products
+   \param aVector First input vector
+   \param bVector Second input vector; each value is conjugated before the multiply
+   \param num_points Number of complex values to multiply
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    for(idx = 0; idx < num_points; ++idx){
+        cVector[idx] = aVector[idx] * lv_conj(bVector[idx]);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
new file mode 100644
index 000000000..09dcd635b
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+   \brief Element-wise product with a conjugated second operand, cVector[i] = aVector[i] * conj(bVector[i]), using SSE3 with unaligned loads and stores
+   \param cVector Output buffer that receives the products
+   \param aVector First input vector
+   \param bVector Second input vector; each value is conjugated before the multiply
+   \param num_points Number of complex values to multiply
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const unsigned int pair_count = num_points / 2;
+    lv_32fc_t* out = cVector;
+    const lv_32fc_t* lhs = aVector;
+    const lv_32fc_t* rhs = bVector;
+    __m128 lhs_vec, rhs_vec, rhs_re, rhs_im, part_a, part_b, prod;
+    unsigned int k;
+
+    /* Sign-bit mask on the imaginary lanes: XOR with it negates them. */
+    const __m128 imag_sign_flip = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+    /* Two complex values (one 128-bit register) per iteration. */
+    for(k = 0; k < pair_count; ++k, lhs += 2, rhs += 2, out += 2){
+        lhs_vec = _mm_loadu_ps((float*)lhs);  /* ar, ai, br, bi */
+        rhs_vec = _mm_loadu_ps((float*)rhs);  /* cr, ci, dr, di */
+
+        rhs_vec = _mm_xor_ps(rhs_vec, imag_sign_flip); /* cr, -ci, dr, -di */
+
+        rhs_re = _mm_moveldup_ps(rhs_vec);    /* real parts duplicated */
+        rhs_im = _mm_movehdup_ps(rhs_vec);    /* (negated) imaginary parts duplicated */
+
+        part_a = _mm_mul_ps(lhs_vec, rhs_re);
+
+        lhs_vec = _mm_shuffle_ps(lhs_vec, lhs_vec, 0xB1); /* ai, ar, bi, br */
+        part_b = _mm_mul_ps(lhs_vec, rhs_im);
+
+        /* subtract in the real lanes, add in the imaginary lanes */
+        prod = _mm_addsub_ps(part_a, part_b);
+
+        _mm_storeu_ps((float*)out, prod);
+    }
+
+    /* Odd count: finish the last value with the scalar complex multiply. */
+    if((num_points % 2) != 0) {
+        *out = (*lhs) * lv_conj(*rhs);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Element-wise product with a conjugated second operand, cVector[i] = aVector[i] * conj(bVector[i]), in portable C
+   \param cVector Output buffer that receives the products
+   \param aVector First input vector
+   \param bVector Second input vector; each value is conjugated before the multiply
+   \param num_points Number of complex values to multiply
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    for(idx = 0; idx < num_points; ++idx){
+        cVector[idx] = aVector[idx] * lv_conj(bVector[idx]);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h
new file mode 100644
index 000000000..75eb9173d
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h
@@ -0,0 +1,126 @@
+#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
+#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
+    /*
+     * For every complex value in points, writes
+     *   target[i] = scalar * |src0[0] - points[i]|^2
+     * num_bytes is the byte length of points (8 bytes per complex value).
+     * points and target are accessed with aligned loads/stores
+     * (_mm_load_ps / _mm_store_ps) and must be 16-byte aligned.
+     *
+     * The work is split into blocks of four points, then an optional pair,
+     * then an optional single point. Each stage touches memory only when it
+     * actually has points to process, so inputs shorter than four points
+     * neither read nor write past the end of the buffers.
+     */
+    __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+    lv_32fc_t diff;
+    float sq_dist;
+
+    int bound = num_bytes >> 5;             /* whole 4-point blocks       */
+    int leftovers0 = (num_bytes >> 4) & 1;  /* one extra pair of points?  */
+    int leftovers1 = (num_bytes >> 3) & 1;  /* one extra single point?    */
+    int i;
+
+    /* xmm1 = {re, im, re, im} of the reference point src0[0] */
+    xmm1 = _mm_setzero_ps();
+    xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+    xmm1 = _mm_movelh_ps(xmm1, xmm1);
+
+    /* xmm8 = scalar broadcast to all four lanes */
+    xmm8 = _mm_load1_ps(&scalar);
+
+    for(i = 0; i < bound; ++i) {
+
+        xmm2 = _mm_load_ps((float*)&points[0]);
+        xmm3 = _mm_load_ps((float*)&points[2]);
+
+        xmm4 = _mm_sub_ps(xmm1, xmm2);
+        xmm5 = _mm_sub_ps(xmm1, xmm3);
+
+        xmm6 = _mm_mul_ps(xmm4, xmm4);
+        xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+        /* horizontal add pairs re^2 + im^2 for each of the four points */
+        xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+        xmm4 = _mm_mul_ps(xmm4, xmm8);
+
+        _mm_store_ps(target, xmm4);
+
+        points += 4;
+        target += 4;
+    }
+
+    if(leftovers0) {
+
+        xmm2 = _mm_load_ps((float*)&points[0]);
+
+        xmm4 = _mm_sub_ps(xmm1, xmm2);
+
+        xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+        /* both halves of the result hold the same two distances */
+        xmm4 = _mm_hadd_ps(xmm6, xmm6);
+
+        xmm4 = _mm_mul_ps(xmm4, xmm8);
+
+        _mm_storeh_pi((__m64*)target, xmm4);
+
+        points += 2;
+        target += 2;
+    }
+
+    if(leftovers1) {
+
+        diff = src0[0] - points[0];
+
+        sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+
+        target[0] = sq_dist;
+    }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
+    /*
+     * Portable version: target[k] = scalar * |src0[0] - points[k]|^2 for
+     * each of the num_bytes / 8 complex values in points.
+     */
+    unsigned int k;
+
+    for(k = 0; k < num_bytes >> 3; ++k) {
+        const lv_32fc_t delta = src0[0] - points[k];
+
+        target[k] = scalar * (lv_creal(delta) * lv_creal(delta) + lv_cimag(delta) * lv_cimag(delta));
+    }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/
diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h
new file mode 100644
index 000000000..b819eaffd
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h
@@ -0,0 +1,112 @@
+#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
+#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+/*!
+  \brief Computes the squared distance |src0[0] - points[n]|^2 for every processed point
+  \param target Output buffer, one float per point (the four-point path uses aligned 16-byte stores)
+  \param src0 Reference complex value; only src0[0] is read
+  \param points Complex points compared against src0[0] (SIMD paths use aligned 16-byte loads)
+  \param num_bytes Byte count; num_bytes >> 3 points are processed
+  \note The four-point path runs only when at least 32 bytes are supplied, so shorter
+        inputs never read or write beyond the num_bytes >> 3 points requested.
+*/
+static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
+
+  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+  lv_32fc_t diff;
+  float sq_dist;
+  int bound = num_bytes >> 5;             /* number of full four-point groups */
+  int leftovers0 = (num_bytes >> 4) & 1;  /* 1 when a two-point group remains */
+  int leftovers1 = (num_bytes >> 3) & 1;  /* 1 when a single point remains */
+  int i = 0;
+
+  /* Replicate src0[0] into both halves of xmm1: [re, im, re, im]. */
+  xmm1 = _mm_setzero_ps();
+  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+  xmm1 = _mm_movelh_ps(xmm1, xmm1);
+
+  /* Without this guard an input shorter than four points would still load
+     four points, store four floats and shift both pointers by four. */
+  if(bound > 0) {
+    /* Prime the pipeline with the first four points. */
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    xmm3 = _mm_load_ps((float*)&points[2]);
+
+    for(; i < bound - 1; ++i) {
+      xmm4 = _mm_sub_ps(xmm1, xmm2);
+      xmm5 = _mm_sub_ps(xmm1, xmm3);
+      points += 4;
+      xmm6 = _mm_mul_ps(xmm4, xmm4);
+      xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+      /* Load the next group while the current one is being reduced. */
+      xmm2 = _mm_load_ps((float*)&points[0]);
+
+      /* Horizontal add pairs re^2 + im^2, giving four squared distances. */
+      xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+      xmm3 = _mm_load_ps((float*)&points[2]);
+
+      _mm_store_ps(target, xmm4);
+
+      target += 4;
+    }
+
+    /* Last full group: nothing further to preload. */
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    xmm5 = _mm_sub_ps(xmm1, xmm3);
+
+    points += 4;
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+    xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+    xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+    _mm_store_ps(target, xmm4);
+
+    target += 4;
+  }
+
+  for(i = 0; i < leftovers0; ++i) {
+
+    xmm2 = _mm_load_ps((float*)&points[0]);
+
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+
+    points += 2;
+
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+    /* Both halves of the result hold the same two distances; store one half. */
+    xmm4 = _mm_hadd_ps(xmm6, xmm6);
+
+    _mm_storeh_pi((__m64*)target, xmm4);
+
+    target += 2;
+  }
+
+  for(i = 0; i < leftovers1; ++i) {
+
+    diff = src0[0] - points[0];
+
+    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+    target[0] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Writes |src0[0] - points[n]|^2 into target[n] for every processed point
+  \param target Output buffer, one float per processed point
+  \param src0 Reference complex value; only src0[0] is read
+  \param points Complex points compared against src0[0]
+  \param num_bytes Byte count; num_bytes >> 3 points are processed
+*/
+static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
+  const unsigned int num_points = num_bytes >> 3;
+  unsigned int n = 0;
+
+  while(n < num_points) {
+    /* src0[0] is re-read every pass, exactly as the reference loop does. */
+    const lv_32fc_t delta = src0[0] - points[n];
+
+    target[n] = lv_creal(delta) * lv_creal(delta) + lv_cimag(delta) * lv_cimag(delta);
+    ++n;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a.h b/volk/include/volk/volk_32i_s32f_convert_32f_a.h
new file mode 100644
index 000000000..8f4123d71
--- /dev/null
+++ b/volk/include/volk/volk_32i_s32f_convert_32f_a.h
@@ -0,0 +1,73 @@
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H
+#define INCLUDED_volk_32i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+ /*!
+  \brief Converts 32 bit integers to floats and multiplies each result by 1.0 / scalar
+  \param outputVector The floating point output buffer (written with aligned 16-byte stores)
+  \param inputVector The 32 bit integer input buffer (read with aligned 16-byte loads)
+  \param scalar The divisor; its reciprocal is applied to every converted value
+  \param num_points The number of data values to be converted
+ */
+static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  const float iScalar = 1.0 / scalar;
+  const __m128 scaleVec = _mm_set_ps1(iScalar);
+  /* Largest multiple of four not exceeding num_points. */
+  const unsigned int simdEnd = (num_points / 4) * 4;
+  unsigned int idx;
+
+  /* Four values per pass: convert, scale, store. */
+  for(idx = 0; idx < simdEnd; idx += 4){
+    const __m128i rawInts = _mm_load_si128((const __m128i*)(inputVector + idx));
+    const __m128 scaled = _mm_mul_ps(_mm_cvtepi32_ps(rawInts), scaleVec);
+
+    _mm_store_ps(outputVector + idx, scaled);
+  }
+
+  /* Scalar tail for the last num_points % 4 values. */
+  for(; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) * iScalar;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+  \brief Converts 32 bit integers to floats and multiplies each result by 1.0 / scalar
+  \param outputVector The floating point output buffer
+  \param inputVector The 32 bit integer input buffer
+  \param scalar The divisor; its reciprocal is applied to every converted value
+  \param num_points The number of data values to be converted
+ */
+static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  const float iScalar = 1.0 / scalar;
+  unsigned int idx = 0;
+
+  while(idx < num_points){
+    outputVector[idx] = ((float)(inputVector[idx])) * iScalar;
+    idx++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h
new file mode 100644
index 000000000..b3a8ab201
--- /dev/null
+++ b/volk/include/volk/volk_32i_s32f_convert_32f_u.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H
+#define INCLUDED_volk_32i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+ /*!
+  \brief Converts 32 bit integers to floats and multiplies each result by 1.0 / scalar
+  \param outputVector The floating point output buffer
+  \param inputVector The 32 bit integer input buffer
+  \param scalar The divisor; its reciprocal is applied to every converted value
+  \param num_points The number of data values to be converted
+  \note Unaligned loads and stores are used, so neither buffer needs 16-byte alignment
+ */
+static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  const float iScalar = 1.0 / scalar;
+  const __m128 scaleVec = _mm_set_ps1(iScalar);
+  /* Largest multiple of four not exceeding num_points. */
+  const unsigned int simdEnd = (num_points / 4) * 4;
+  unsigned int idx;
+
+  /* Four values per pass: convert, scale, store. */
+  for(idx = 0; idx < simdEnd; idx += 4){
+    const __m128i rawInts = _mm_loadu_si128((const __m128i*)(inputVector + idx));
+    const __m128 scaled = _mm_mul_ps(_mm_cvtepi32_ps(rawInts), scaleVec);
+
+    _mm_storeu_ps(outputVector + idx, scaled);
+  }
+
+  /* Scalar tail for the last num_points % 4 values. */
+  for(; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) * iScalar;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+  \brief Converts 32 bit integers to floats and multiplies each result by 1.0 / scalar
+  \param outputVector The floating point output buffer
+  \param inputVector The 32 bit integer input buffer
+  \param scalar The divisor; its reciprocal is applied to every converted value
+  \param num_points The number of data values to be converted
+  \note Plain element-wise accesses only; no 16-byte alignment is required
+ */
+static inline void volk_32i_s32f_convert_32f_u_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  const float iScalar = 1.0 / scalar;
+  unsigned int idx = 0;
+
+  while(idx < num_points){
+    outputVector[idx] = ((float)(inputVector[idx])) * iScalar;
+    idx++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_32i_x2_and_32i_a.h b/volk/include/volk/volk_32i_x2_and_32i_a.h
new file mode 100644
index 000000000..e5330847b
--- /dev/null
+++ b/volk/include/volk/volk_32i_x2_and_32i_a.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32i_x2_and_32i_a_H
+#define INCLUDED_volk_32i_x2_and_32i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes the bitwise AND of two 32 bit integer vectors
+  \param cVector The vector where the results will be stored (aligned 16-byte stores)
+  \param aVector First operand vector (aligned 16-byte loads)
+  \param bVector Second operand vector (aligned 16-byte loads)
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+  /* Largest multiple of four not exceeding num_points. */
+  const unsigned int simdEnd = (num_points / 4) * 4;
+  unsigned int idx = 0;
+
+  /* The integers are moved through float registers; only their bit patterns are combined. */
+  while(idx < simdEnd){
+    const __m128 lhs = _mm_load_ps((const float*)(aVector + idx));
+    const __m128 rhs = _mm_load_ps((const float*)(bVector + idx));
+
+    _mm_store_ps((float*)(cVector + idx), _mm_and_ps(lhs, rhs));
+    idx += 4;
+  }
+
+  /* Scalar tail for the last num_points % 4 values. */
+  for(; idx < num_points; idx++){
+    cVector[idx] = aVector[idx] & bVector[idx];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes the bitwise AND of two 32 bit integer vectors
+  \param cVector The vector where the results will be stored
+  \param aVector First operand vector
+  \param bVector Second operand vector
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32i_x2_and_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+  unsigned int idx = 0;
+
+  while(idx < num_points){
+    cVector[idx] = aVector[idx] & bVector[idx];
+    idx++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+  \brief Forwards a bitwise AND of two 32 bit integer vectors to the externally defined _orc_impl routine
+  \param cVector The vector where the results will be stored
+  \param aVector First operand vector
+  \param bVector Second operand vector
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+
+static inline void volk_32i_x2_and_32i_a_orc(int32_t* cVector,
+                                             const int32_t* aVector,
+                                             const int32_t* bVector,
+                                             unsigned int num_points)
+{
+  /* Thin forwarder: arguments are passed through unchanged. */
+  volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32i_x2_and_32i_a_H */
diff --git a/volk/include/volk/volk_32i_x2_or_32i_a.h b/volk/include/volk/volk_32i_x2_or_32i_a.h
new file mode 100644
index 000000000..24045894c
--- /dev/null
+++ b/volk/include/volk/volk_32i_x2_or_32i_a.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32i_x2_or_32i_a_H
+#define INCLUDED_volk_32i_x2_or_32i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes the bitwise OR of two 32 bit integer vectors
+  \param cVector The vector where the results will be stored (aligned 16-byte stores)
+  \param aVector First operand vector (aligned 16-byte loads)
+  \param bVector Second operand vector (aligned 16-byte loads)
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+  /* Largest multiple of four not exceeding num_points. */
+  const unsigned int simdEnd = (num_points / 4) * 4;
+  unsigned int idx = 0;
+
+  /* The integers are moved through float registers; only their bit patterns are combined. */
+  while(idx < simdEnd){
+    const __m128 lhs = _mm_load_ps((const float*)(aVector + idx));
+    const __m128 rhs = _mm_load_ps((const float*)(bVector + idx));
+
+    _mm_store_ps((float*)(cVector + idx), _mm_or_ps(lhs, rhs));
+    idx += 4;
+  }
+
+  /* Scalar tail for the last num_points % 4 values. */
+  for(; idx < num_points; idx++){
+    cVector[idx] = aVector[idx] | bVector[idx];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes the bitwise OR of two 32 bit integer vectors
+  \param cVector The vector where the results will be stored
+  \param aVector First operand vector
+  \param bVector Second operand vector
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+static inline void volk_32i_x2_or_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+  unsigned int idx = 0;
+
+  while(idx < num_points){
+    cVector[idx] = aVector[idx] | bVector[idx];
+    idx++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+  \brief Forwards a bitwise OR of two 32 bit integer vectors to the externally defined _orc_impl routine
+  \param cVector The vector where the results will be stored
+  \param aVector First operand vector
+  \param bVector Second operand vector
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+
+static inline void volk_32i_x2_or_32i_a_orc(int32_t* cVector,
+                                            const int32_t* aVector,
+                                            const int32_t* bVector,
+                                            unsigned int num_points)
+{
+  /* Thin forwarder: arguments are passed through unchanged. */
+  volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32i_x2_or_32i_a_H */
diff --git a/volk/include/volk/volk_32u_byteswap_a.h b/volk/include/volk/volk_32u_byteswap_a.h
new file mode 100644
index 000000000..71ae027d3
--- /dev/null
+++ b/volk/include/volk/volk_32u_byteswap_a.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_32u_byteswap_a_H
+#define INCLUDED_volk_32u_byteswap_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Reverses, in place, the byte order of every 32 bit word in an aligned vector
+  \param intsToSwap The vector of data to byte swap (accessed with aligned 16-byte loads and stores)
+  \param num_points The number of 32 bit words to swap
+*/
+static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points){
+  const __m128i keepBits16to23 = _mm_set1_epi32(0x00FF0000);
+  const __m128i keepBits8to15 = _mm_set1_epi32(0x0000FF00);
+  const unsigned int quadCount = num_points / 4;
+  uint32_t* cursor = intsToSwap;
+  uint32_t* const simdStop = intsToSwap + quadCount * 4;
+  uint32_t* const stop = intsToSwap + num_points;
+
+  /* Four words per pass; the outer bytes need no mask because the shifts already zero-fill. */
+  while(cursor != simdStop){
+    const __m128i words = _mm_load_si128((__m128i*)cursor);
+    __m128i swapped = _mm_or_si128(_mm_slli_epi32(words, 24), _mm_srli_epi32(words, 24));
+
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_slli_epi32(words, 8), keepBits16to23));
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_srli_epi32(words, 8), keepBits8to15));
+
+    _mm_store_si128((__m128i*)cursor, swapped);
+    cursor += 4;
+  }
+
+  /* Scalar tail for the last num_points % 4 words. */
+  for(; cursor != stop; cursor++){
+    const uint32_t word = *cursor;
+
+    *cursor = (((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) | ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Reverses, in place, the byte order of every 32 bit word in a vector
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of 32 bit words to swap
+*/
+static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points){
+  unsigned int idx = 0;
+
+  while(idx < num_points){
+    const uint32_t word = intsToSwap[idx];
+    /* Each term moves one source byte to its mirrored position. */
+    const uint32_t fromTop = (word >> 24) & 0xff;
+    const uint32_t fromUpperMid = (word >> 8) & 0x0000ff00;
+    const uint32_t fromLowerMid = (word << 8) & 0x00ff0000;
+    const uint32_t fromBottom = (word << 24) & 0xff000000;
+
+    intsToSwap[idx] = fromTop | fromUpperMid | fromLowerMid | fromBottom;
+    idx++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32u_byteswap_a_H */
diff --git a/volk/include/volk/volk_32u_byteswap_u.h b/volk/include/volk/volk_32u_byteswap_u.h
new file mode 100644
index 000000000..e27d1f03d
--- /dev/null
+++ b/volk/include/volk/volk_32u_byteswap_u.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_32u_byteswap_u_H
+#define INCLUDED_volk_32u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Reverses, in place, the byte order of every 32 bit word in a vector
+  \param intsToSwap The vector of data to byte swap (unaligned loads and stores are used, so 16-byte alignment is not required)
+  \param num_points The number of 32 bit words to swap
+*/
+static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){
+  const __m128i keepBits16to23 = _mm_set1_epi32(0x00FF0000);
+  const __m128i keepBits8to15 = _mm_set1_epi32(0x0000FF00);
+  const unsigned int quadCount = num_points / 4;
+  uint32_t* cursor = intsToSwap;
+  uint32_t* const simdStop = intsToSwap + quadCount * 4;
+  uint32_t* const stop = intsToSwap + num_points;
+
+  /* Four words per pass; the outer bytes need no mask because the shifts already zero-fill. */
+  while(cursor != simdStop){
+    const __m128i words = _mm_loadu_si128((__m128i*)cursor);
+    __m128i swapped = _mm_or_si128(_mm_slli_epi32(words, 24), _mm_srli_epi32(words, 24));
+
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_slli_epi32(words, 8), keepBits16to23));
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_srli_epi32(words, 8), keepBits8to15));
+
+    _mm_storeu_si128((__m128i*)cursor, swapped);
+    cursor += 4;
+  }
+
+  /* Scalar tail for the last num_points % 4 words. */
+  for(; cursor != stop; cursor++){
+    const uint32_t word = *cursor;
+
+    *cursor = (((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) | ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Reverses, in place, the byte order of every 32 bit word in a vector
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of 32 bit words to swap
+*/
+static inline void volk_32u_byteswap_u_generic(uint32_t* intsToSwap, unsigned int num_points){
+  unsigned int idx = 0;
+
+  while(idx < num_points){
+    const uint32_t word = intsToSwap[idx];
+    /* Each term moves one source byte to its mirrored position. */
+    const uint32_t fromTop = (word >> 24) & 0xff;
+    const uint32_t fromUpperMid = (word >> 8) & 0x0000ff00;
+    const uint32_t fromLowerMid = (word << 8) & 0x00ff0000;
+    const uint32_t fromBottom = (word << 24) & 0xff000000;
+
+    intsToSwap[idx] = fromTop | fromUpperMid | fromLowerMid | fromBottom;
+    idx++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32u_byteswap_u_H */
diff --git a/volk/include/volk/volk_32u_popcnt_a.h b/volk/include/volk/volk_32u_popcnt_a.h
new file mode 100644
index 000000000..b72d605c6
--- /dev/null
+++ b/volk/include/volk/volk_32u_popcnt_a.h
@@ -0,0 +1,36 @@
+#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H
+#define INCLUDED_VOLK_32u_POPCNT_A16_H
+
+#include <stdio.h>
+#include <inttypes.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*!
+  \brief Counts the bits set in a 32 bit value
+  \param ret Receives the number of set bits
+  \param value The value whose set bits are counted
+*/
+static inline void volk_32u_popcnt_a_generic(uint32_t* ret, const uint32_t value) {
+  uint32_t bits = value;
+
+  /* Parallel reduction: 1-bit fields are summed into 2-bit fields, then 4, 8, 16 and 32. */
+  bits = (bits & 0x55555555) + ((bits >> 1) & 0x55555555);
+  bits = (bits & 0x33333333) + ((bits >> 2) & 0x33333333);
+  bits = (bits + (bits >> 4)) & 0x0F0F0F0F;
+  bits += bits >> 8;
+  bits += bits >> 16;
+
+  /* Only the low six bits hold the count (0..32); the rest is carry-over from the folds. */
+  *ret = bits & 0x0000003F;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE4_2
+
+#include <nmmintrin.h>
+
+/*!
+  \brief Counts the bits set in a 32 bit value using the _mm_popcnt_u32 intrinsic
+  \param ret Receives the number of set bits
+  \param value The value whose set bits are counted
+*/
+static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value) {
+  const int setBits = _mm_popcnt_u32(value);
+
+  *ret = setBits;
+}
+
+#endif /*LV_HAVE_SSE4_2*/
+
+#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/
diff --git a/volk/include/volk/volk_64f_convert_32f_a.h b/volk/include/volk/volk_64f_convert_32f_a.h
new file mode 100644
index 000000000..11d51702b
--- /dev/null
+++ b/volk/include/volk/volk_64f_convert_32f_a.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_volk_64f_convert_32f_a_H
+#define INCLUDED_volk_64f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+  \brief Converts double values into float values
+  \param outputVector The converted float values (written with aligned 16-byte stores)
+  \param inputVector The double values to be converted (read with aligned 16-byte loads)
+  \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+  /* Largest multiple of four not exceeding num_points. */
+  const unsigned int simdEnd = (num_points / 4) * 4;
+  unsigned int idx;
+
+  for(idx = 0; idx < simdEnd; idx += 4){
+    /* Each conversion yields two floats in the low half of its register. */
+    const __m128 firstPair = _mm_cvtpd_ps(_mm_load_pd(inputVector + idx));
+    const __m128 secondPair = _mm_cvtpd_ps(_mm_load_pd(inputVector + idx + 2));
+
+    /* Join the two low halves into one register of four floats. */
+    _mm_store_ps(outputVector + idx, _mm_movelh_ps(firstPair, secondPair));
+  }
+
+  /* Scalar tail for the last num_points % 4 values. */
+  for(; idx < num_points; idx++){
+    outputVector[idx] = (float)(inputVector[idx]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts double values into float values
+  \param outputVector The converted float values
+  \param inputVector The double values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+  unsigned int idx = 0;
+
+  while(idx < num_points){
+    outputVector[idx] = ((float)(inputVector[idx]));
+    idx++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h
new file mode 100644
index 000000000..31dc5b5fe
--- /dev/null
+++ b/volk/include/volk/volk_64f_convert_32f_u.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_volk_64f_convert_32f_u_H
+#define INCLUDED_volk_64f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+  \brief Converts double values into float values
+  \param outputVector The converted float values
+  \param inputVector The double values to be converted
+  \param num_points The number of points in the two vectors to be converted
+  \note Unaligned loads and stores are used, so neither buffer needs 16-byte alignment
+ */
+static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+  /* Largest multiple of four not exceeding num_points. */
+  const unsigned int simdEnd = (num_points / 4) * 4;
+  unsigned int idx;
+
+  for(idx = 0; idx < simdEnd; idx += 4){
+    /* Each conversion yields two floats in the low half of its register. */
+    const __m128 firstPair = _mm_cvtpd_ps(_mm_loadu_pd(inputVector + idx));
+    const __m128 secondPair = _mm_cvtpd_ps(_mm_loadu_pd(inputVector + idx + 2));
+
+    /* Join the two low halves into one register of four floats. */
+    _mm_storeu_ps(outputVector + idx, _mm_movelh_ps(firstPair, secondPair));
+  }
+
+  /* Scalar tail for the last num_points % 4 values. */
+  for(; idx < num_points; idx++){
+    outputVector[idx] = (float)(inputVector[idx]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts double values into float values
+  \param outputVector The converted float values
+  \param inputVector The double values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_u_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+  unsigned int idx = 0;
+
+  while(idx < num_points){
+    outputVector[idx] = ((float)(inputVector[idx]));
+    idx++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_64f_x2_max_64f_a.h b/volk/include/volk/volk_64f_x2_max_64f_a.h
new file mode 100644
index 000000000..33aae6d10
--- /dev/null
+++ b/volk/include/volk/volk_64f_x2_max_64f_a.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_64f_x2_max_64f_a_H
+#define INCLUDED_volk_64f_x2_max_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Stores the element-wise maximum of aVector and bVector in cVector
+  \param cVector The vector where the results will be stored (aligned 16-byte stores)
+  \param aVector The first vector to be compared (aligned 16-byte loads)
+  \param bVector The second vector to be compared (aligned 16-byte loads)
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+  /* Largest multiple of two not exceeding num_points. */
+  const unsigned int simdEnd = (num_points / 2) * 2;
+  unsigned int idx = 0;
+
+  /* Two doubles per pass. */
+  while(idx < simdEnd){
+    const __m128d lhs = _mm_load_pd(aVector + idx);
+    const __m128d rhs = _mm_load_pd(bVector + idx);
+
+    _mm_store_pd(cVector + idx, _mm_max_pd(lhs, rhs));
+    idx += 2;
+  }
+
+  /* Scalar tail: at most one value remains. */
+  for(; idx < num_points; idx++){
+    const double lhs = aVector[idx];
+    const double rhs = bVector[idx];
+
+    cVector[idx] = ( lhs > rhs ? lhs : rhs);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Stores the element-wise maximum of aVector and bVector in cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_x2_max_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    const double lhs = aVector[idx];
+    const double rhs = bVector[idx];
+
+    /* bVector's value is chosen whenever lhs > rhs is false. */
+    if(lhs > rhs){
+      cVector[idx] = lhs;
+    }
+    else{
+      cVector[idx] = rhs;
+    }
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_x2_max_64f_a_H */
diff --git a/volk/include/volk/volk_64f_x2_min_64f_a.h b/volk/include/volk/volk_64f_x2_min_64f_a.h
new file mode 100644
index 000000000..25d8b4c98
--- /dev/null
+++ b/volk/include/volk/volk_64f_x2_min_64f_a.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_64f_x2_min_64f_a_H
+#define INCLUDED_volk_64f_x2_min_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Stores the element-wise minimum of aVector and bVector in cVector
+  \param cVector The vector where the results will be stored (aligned 16-byte stores)
+  \param aVector The first vector to be compared (aligned 16-byte loads)
+  \param bVector The second vector to be compared (aligned 16-byte loads)
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+  /* Largest multiple of two not exceeding num_points. */
+  const unsigned int simdEnd = (num_points / 2) * 2;
+  unsigned int idx = 0;
+
+  /* Two doubles per pass. */
+  while(idx < simdEnd){
+    const __m128d lhs = _mm_load_pd(aVector + idx);
+    const __m128d rhs = _mm_load_pd(bVector + idx);
+
+    _mm_store_pd(cVector + idx, _mm_min_pd(lhs, rhs));
+    idx += 2;
+  }
+
+  /* Scalar tail: at most one value remains. */
+  for(; idx < num_points; idx++){
+    const double lhs = aVector[idx];
+    const double rhs = bVector[idx];
+
+    cVector[idx] = ( lhs < rhs ? lhs : rhs);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Stores the element-wise minimum of aVector and bVector in cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_x2_min_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    const double lhs = aVector[idx];
+    const double rhs = bVector[idx];
+
+    /* bVector's value is chosen whenever lhs < rhs is false. */
+    if(lhs < rhs){
+      cVector[idx] = lhs;
+    }
+    else{
+      cVector[idx] = rhs;
+    }
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_x2_min_64f_a_H */
diff --git a/volk/include/volk/volk_64u_byteswap_a.h b/volk/include/volk/volk_64u_byteswap_a.h
new file mode 100644
index 000000000..3d1d87623
--- /dev/null
+++ b/volk/include/volk/volk_64u_byteswap_a.h
@@ -0,0 +1,88 @@
+#ifndef INCLUDED_volk_64u_byteswap_a_H
+#define INCLUDED_volk_64u_byteswap_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Reverses, in place, the byte order of every 64 bit word in an aligned vector
+  \param intsToSwap The vector of data to byte swap (accessed with aligned 16-byte loads and stores)
+  \param num_points The number of 64 bit words to swap
+*/
+static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){
+  const __m128i keepBits16to23 = _mm_set1_epi32(0x00FF0000);
+  const __m128i keepBits8to15 = _mm_set1_epi32(0x0000FF00);
+  const unsigned int pairCount = num_points / 2;
+  /* The data is handled as 32 bit halves: each half is byte swapped, then the halves trade places. */
+  uint32_t* halves = (uint32_t*)intsToSwap;
+  unsigned int done;
+
+  /* Two 64 bit words (four halves) per pass. */
+  for(done = 0; done < pairCount; done++){
+    const __m128i raw = _mm_load_si128((__m128i*)halves);
+    __m128i swapped = _mm_or_si128(_mm_slli_epi32(raw, 24), _mm_srli_epi32(raw, 24));
+
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_slli_epi32(raw, 8), keepBits16to23));
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_srli_epi32(raw, 8), keepBits8to15));
+
+    /* Exchange the two halves inside each 64 bit word. */
+    swapped = _mm_shuffle_epi32(swapped, _MM_SHUFFLE(2, 3, 0, 1));
+
+    _mm_store_si128((__m128i*)halves, swapped);
+    halves += 4;
+  }
+
+  /* Scalar tail: at most one 64 bit word remains. */
+  for(done = pairCount * 2; done < num_points; done++){
+    const uint32_t first = halves[0];
+    const uint32_t second = halves[1];
+
+    halves[0] = (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) | ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
+    halves[1] = (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) | ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
+    halves += 2;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Reverses, in place, the byte order of every 64 bit word in a vector
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of 64 bit words to swap
+*/
+static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){
+  /* The data is handled as 32 bit halves: each half is byte swapped, then the halves trade places. */
+  uint32_t* halves = (uint32_t*)intsToSwap;
+  unsigned int remaining = num_points;
+
+  while(remaining > 0){
+    const uint32_t first = halves[0];
+    const uint32_t second = halves[1];
+
+    halves[0] = (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) | ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
+    halves[1] = (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) | ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
+
+    halves += 2;
+    remaining--;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64u_byteswap_a_H */
diff --git a/volk/include/volk/volk_64u_byteswap_u.h b/volk/include/volk/volk_64u_byteswap_u.h
new file mode 100644
index 000000000..41a4a3130
--- /dev/null
+++ b/volk/include/volk/volk_64u_byteswap_u.h
@@ -0,0 +1,88 @@
+#ifndef INCLUDED_volk_64u_byteswap_u_H
+#define INCLUDED_volk_64u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) a vector of 64 bit values using unaligned SSE2 loads and stores.
+  \param intsToSwap The vector of data to byte swap; overwritten with the result
+  \param num_points The number of 64 bit data points
+*/
+static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
+  uint32_t* wordPtr = (uint32_t*)intsToSwap;
+  const __m128i upperMiddleMask = _mm_set1_epi32(0x00FF0000);
+  const __m128i lowerMiddleMask = _mm_set1_epi32(0x0000FF00);
+  const unsigned int pairCount = num_points / 2;
+  unsigned int idx;
+
+  // Vector part: two 64 bit values (four 32 bit words) per iteration.
+  for(idx = 0; idx < pairCount; idx++){
+    // The pointer is advanced only after the store because the swap is in-place.
+    const __m128i src = _mm_loadu_si128((__m128i*)wordPtr);
+
+    // Byte-reverse each 32 bit lane. The 24 bit shifts already isolate the
+    // outer bytes; the 8 bit shifts need a mask to keep only the middle byte.
+    __m128i swapped = _mm_or_si128(_mm_slli_epi32(src, 24), _mm_srli_epi32(src, 24));
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_slli_epi32(src, 8), upperMiddleMask));
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_srli_epi32(src, 8), lowerMiddleMask));
+
+    // Exchange the two 32 bit lanes inside each 64 bit value.
+    swapped = _mm_shuffle_epi32(swapped, _MM_SHUFFLE(2, 3, 0, 1));
+
+    _mm_storeu_si128((__m128i*)wordPtr, swapped);
+    wordPtr += 4;
+  }
+
+  // Scalar tail for the value left over when num_points is odd.
+  for(idx = pairCount * 2; idx < num_points; idx++){
+    const uint32_t first = wordPtr[0];
+    const uint32_t second = wordPtr[1];
+
+    wordPtr[0] = (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) | ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
+    wordPtr[1] = (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) | ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
+    wordPtr += 2;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) a vector of 64 bit values with plain scalar code.
+  \param intsToSwap The vector of data to byte swap; overwritten with the result
+  \param num_points The number of 64 bit data points
+*/
+static inline void volk_64u_byteswap_u_generic(uint64_t* intsToSwap, unsigned int num_points){
+  uint32_t* halfPtr = (uint32_t*)intsToSwap;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, halfPtr += 2){
+    // Read both 32 bit halves before either one is overwritten.
+    const uint32_t a = halfPtr[0];
+    const uint32_t b = halfPtr[1];
+
+    // Byte-reverse each half ...
+    const uint32_t aSwapped = (((a >> 24) & 0xff) | ((a >> 8) & 0x0000ff00) | ((a << 8) & 0x00ff0000) | ((a << 24) & 0xff000000));
+    const uint32_t bSwapped = (((b >> 24) & 0xff) | ((b >> 8) & 0x0000ff00) | ((b << 8) & 0x00ff0000) | ((b << 24) & 0xff000000));
+
+    // ... and store them in exchanged positions.
+    halfPtr[0] = bSwapped;
+    halfPtr[1] = aSwapped;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64u_byteswap_u_H */
diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h
new file mode 100644
index 000000000..5e68ed208
--- /dev/null
+++ b/volk/include/volk/volk_64u_popcnt_a.h
@@ -0,0 +1,52 @@
+#ifndef INCLUDED_volk_64u_popcnt_a_H
+#define INCLUDED_volk_64u_popcnt_a_H
+
+#include <stdio.h>
+#include <inttypes.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Counts the number of set bits in a 64 bit value.
+  \param ret Receives the number of bits set in value (0 to 64)
+  \param value The 64 bit value whose set bits are counted
+*/
+static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) {
+  uint64_t total = 0;
+  unsigned int half;
+
+  // The value is processed as two 32 bit halves. The upper half must be
+  // shifted down by a full 32 bits: shifting by 31 drops bit 63 and
+  // undercounts any value that has its top bit set.
+  for(half = 0; half < 2; half++){
+    uint32_t bits = (uint32_t)(value >> (32 * half));
+
+    // Parallel bit count: sum adjacent 1 bit, 2 bit and 4 bit fields,
+    // then fold the four byte sums into the lowest byte.
+    bits = (bits & 0x55555555) + ((bits >> 1) & 0x55555555);
+    bits = (bits & 0x33333333) + ((bits >> 2) & 0x33333333);
+    bits = (bits + (bits >> 4)) & 0x0F0F0F0F;
+    bits = (bits + (bits >> 8));
+    bits = (bits + (bits >> 16)) & 0x0000003F;
+
+    total += bits;
+  }
+
+  *ret = total;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#if LV_HAVE_SSE4_2 && LV_HAVE_64
+
+#include <nmmintrin.h>
+
+/*!
+  \brief Counts the number of set bits in a 64 bit value using the _mm_popcnt_u64 intrinsic.
+  \param ret Receives the number of bits set in value
+  \param value The 64 bit value whose set bits are counted
+*/
+static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) {
+  const uint64_t bitCount = _mm_popcnt_u64(value);
+  *ret = bitCount;
+}
+
+#endif /*LV_HAVE_SSE4_2*/
+
+#endif /*INCLUDED_volk_64u_popcnt_a_H*/
diff --git a/volk/include/volk/volk_8i_convert_16i_a.h b/volk/include/volk/volk_8i_convert_16i_a.h
new file mode 100644
index 000000000..9104f90cb
--- /dev/null
+++ b/volk/include/volk/volk_8i_convert_16i_a.h
@@ -0,0 +1,83 @@
+#ifndef INCLUDED_volk_8i_convert_16i_a_H
+#define INCLUDED_volk_8i_convert_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+   \brief Converts 8 bit integer data into 16 bit integer data, scaling every value by 256
+   \param outputVector The 16 bit output data buffer (written with aligned 128 bit stores)
+   \param inputVector The 8 bit input data buffer (read with aligned 128 bit loads)
+   \param num_points The number of data values to be converted
+ */
+static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 16;
+  const __m128i* src = (const __m128i*)inputVector;
+  __m128i* dst = (__m128i*)outputVector;
+  unsigned int idx;
+
+  // Vector part: 16 input bytes produce two stores of eight int16 values.
+  for(idx = 0; idx < blockCount; idx++){
+    const __m128i bytes = _mm_load_si128(src);
+
+    // Low eight bytes: sign-extend to 16 bit, shift left by 8 (multiply by 256).
+    _mm_store_si128(dst, _mm_slli_epi16(_mm_cvtepi8_epi16(bytes), 8));
+
+    // High eight bytes: move them into the low half, then do the same.
+    _mm_store_si128(dst + 1, _mm_slli_epi16(_mm_cvtepi8_epi16(_mm_srli_si128(bytes, 8)), 8));
+
+    src++;
+    dst += 2;
+  }
+
+  // Scalar tail for the values that do not fill a whole 16 byte block.
+  for(idx = blockCount * 16; idx < num_points; idx++){
+    outputVector[idx] = (int16_t)(inputVector[idx])*256;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Converts 8 bit integer data into 16 bit integer data, scaling every value by 256
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param num_points The number of data values to be converted
+ */
+static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  // Widen first, then multiply, so the scaling is done on the widened value.
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = ((int16_t)(inputVector[idx])) * 256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+   \brief Converts 8 bit integer data into 16 bit integer data by forwarding to the ORC implementation
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param num_points The number of data values to be converted
+ */
+// The ORC kernel is defined outside this header; only its prototype is declared here.
+extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points);
+
+static inline void volk_8i_convert_16i_a_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  // Pure pass-through: all arguments are handed on unchanged.
+  volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_8i_convert_16i_a_H */
diff --git a/volk/include/volk/volk_8i_convert_16i_u.h b/volk/include/volk/volk_8i_convert_16i_u.h
new file mode 100644
index 000000000..7d7104f52
--- /dev/null
+++ b/volk/include/volk/volk_8i_convert_16i_u.h
@@ -0,0 +1,73 @@
+#ifndef INCLUDED_volk_8i_convert_16i_u_H
+#define INCLUDED_volk_8i_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+   \brief Converts 8 bit integer data into 16 bit integer data, scaling every value by 256
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param num_points The number of data values to be converted
+   \note Input and output buffers do NOT need to be properly aligned
+ */
+static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 16;
+  const __m128i* src = (const __m128i*)inputVector;
+  __m128i* dst = (__m128i*)outputVector;
+  unsigned int remainingBlocks = blockCount;
+  unsigned int idx;
+
+  // Vector part: 16 input bytes produce two unaligned stores of eight int16 values.
+  while(remainingBlocks > 0){
+    const __m128i bytes = _mm_loadu_si128(src);
+    const __m128i lowHalf = _mm_cvtepi8_epi16(bytes);
+    const __m128i highHalf = _mm_cvtepi8_epi16(_mm_srli_si128(bytes, 8));
+
+    // Shifting left by 8 multiplies each widened value by 256.
+    _mm_storeu_si128(dst, _mm_slli_epi16(lowHalf, 8));
+    _mm_storeu_si128(dst + 1, _mm_slli_epi16(highHalf, 8));
+
+    src++;
+    dst += 2;
+    remainingBlocks--;
+  }
+
+  // Scalar tail for the values that do not fill a whole 16 byte block.
+  for(idx = blockCount * 16; idx < num_points; idx++){
+    outputVector[idx] = (int16_t)(inputVector[idx])*256;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Converts 8 bit integer data into 16 bit integer data, scaling every value by 256
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param num_points The number of data values to be converted
+   \note Input and output buffers do NOT need to be properly aligned
+ */
+static inline void volk_8i_convert_16i_u_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  const int8_t* src = inputVector;
+  int16_t* dst = outputVector;
+  unsigned int remaining = num_points;
+
+  while(remaining > 0){
+    const int16_t widened = (int16_t)(*src);
+    *dst = widened * 256;
+    src++;
+    dst++;
+    remaining--;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8i_convert_16i_u_H */
diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a.h b/volk/include/volk/volk_8i_s32f_convert_32f_a.h
new file mode 100644
index 000000000..02a7f356e
--- /dev/null
+++ b/volk/include/volk/volk_8i_s32f_convert_32f_a.h
@@ -0,0 +1,106 @@
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
+#define INCLUDED_volk_8i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+   \brief Converts the input 8 bit integer data into floating point data, dividing each output value by the scalar
+   \param outputVector The floating point output data buffer (written with aligned stores)
+   \param inputVector The 8 bit input data buffer (read with aligned loads)
+   \param scalar The value each converted point is divided by; applied as a multiplication by 1.0 / scalar
+   \param num_points The number of data values to be converted
+ */
+static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  const unsigned int blockCount = num_points / 16;
+  const float reciprocal = 1.0 / scalar;
+  const __m128 reciprocalVec = _mm_set_ps1(reciprocal);
+  const int8_t* src = inputVector;
+  float* dst = outputVector;
+  unsigned int idx;
+  unsigned int quad;
+
+  // Vector part: one 16 byte load feeds four groups of four floats.
+  for(idx = 0; idx < blockCount; idx++){
+    __m128i bytes = _mm_load_si128((const __m128i*)src);
+
+    for(quad = 0; quad < 4; quad++){
+      // Sign-extend the lowest four bytes to 32 bit, convert to float and scale.
+      const __m128 asFloat = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(bytes));
+      _mm_store_ps(dst, _mm_mul_ps(asFloat, reciprocalVec));
+      dst += 4;
+
+      // Bring the next four bytes down into the lowest position.
+      bytes = _mm_srli_si128(bytes, 4);
+    }
+
+    src += 16;
+  }
+
+  // Scalar tail for the values that do not fill a whole 16 byte block.
+  for(idx = blockCount * 16; idx < num_points; idx++){
+    outputVector[idx] = (float)(inputVector[idx]) * reciprocal;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Converts the input 8 bit integer data into floating point data, dividing each output value by the scalar
+   \param outputVector The floating point output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param scalar The value each converted point is divided by; applied as a multiplication by 1.0 / scalar
+   \param num_points The number of data values to be converted
+ */
+static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  // The reciprocal is computed once so the loop only multiplies.
+  const float reciprocal = 1.0 / scalar;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) * reciprocal;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+   \brief Converts the input 8 bit integer data into floating point data by forwarding to the ORC implementation
+   \param outputVector The floating point output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param scalar The value divided against each point; its reciprocal is what gets passed to the ORC kernel
+   \param num_points The number of data values to be converted
+ */
+// The ORC kernel is defined outside this header; only its prototype is declared here.
+extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
+
+static inline void volk_8i_s32f_convert_32f_a_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  // The kernel's third argument receives 1.0 / scalar rather than scalar itself.
+  const float reciprocal = 1.0 / scalar;
+  volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, reciprocal, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h
new file mode 100644
index 000000000..8bb2c0d1a
--- /dev/null
+++ b/volk/include/volk/volk_8i_s32f_convert_32f_u.h
@@ -0,0 +1,94 @@
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
+#define INCLUDED_volk_8i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+   \brief Converts the input 8 bit integer data into floating point data, dividing each output value by the scalar
+   \param outputVector The floating point output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param scalar The value each converted point is divided by; applied as a multiplication by 1.0 / scalar
+   \param num_points The number of data values to be converted
+   \note Input and output buffers are accessed with unaligned loads and stores
+ */
+static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  const unsigned int blockCount = num_points / 16;
+  const float reciprocal = 1.0 / scalar;
+  const __m128 reciprocalVec = _mm_set_ps1( reciprocal );
+  const int8_t* src = inputVector;
+  float* dst = outputVector;
+  unsigned int idx;
+
+  // Vector part: one 16 byte load is converted as four groups of four values.
+  for(idx = 0; idx < blockCount; idx++){
+    const __m128i group0 = _mm_loadu_si128((const __m128i*)src);
+    const __m128i group1 = _mm_srli_si128(group0, 4);
+    const __m128i group2 = _mm_srli_si128(group1, 4);
+    const __m128i group3 = _mm_srli_si128(group2, 4);
+
+    // Each group: sign-extend the lowest four bytes, convert to float, scale, store.
+    _mm_storeu_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(group0)), reciprocalVec));
+    _mm_storeu_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(group1)), reciprocalVec));
+    _mm_storeu_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(group2)), reciprocalVec));
+    _mm_storeu_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(group3)), reciprocalVec));
+
+    src += 16;
+    dst += 16;
+  }
+
+  // Scalar tail for the values that do not fill a whole 16 byte block.
+  for(idx = blockCount * 16; idx < num_points; idx++){
+    outputVector[idx] = (float)(inputVector[idx]) * reciprocal;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Converts the input 8 bit integer data into floating point data, dividing each output value by the scalar
+   \param outputVector The floating point output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param scalar The value each converted point is divided by; applied as a multiplication by 1.0 / scalar
+   \param num_points The number of data values to be converted
+   \note Buffers are accessed one element at a time
+ */
+static inline void volk_8i_s32f_convert_32f_u_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  const float reciprocal = 1.0 / scalar;
+  const int8_t* src = inputVector;
+  float* dst = outputVector;
+  unsigned int remaining;
+
+  for(remaining = num_points; remaining > 0; remaining--){
+    const float sample = (float)(*src);
+    *dst = sample * reciprocal;
+    src++;
+    dst++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h
new file mode 100644
index 000000000..8f13da32f
--- /dev/null
+++ b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
+#define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data, scaling every value by 256
+  \param iBuffer The I buffer output data (written with aligned stores)
+  \param qBuffer The Q buffer output data (written with aligned stores)
+  \param complexVector The complex input vector (read with aligned loads)
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* src = (const int8_t*)complexVector;
+  int16_t* iOut = iBuffer;
+  int16_t* qOut = qBuffer;
+  // Shuffle controls: gather the even (I) or odd (Q) bytes into the low eight
+  // bytes; a control byte of 0x80 produces a zero byte in the result.
+  const __m128i pickI = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  const __m128i pickQ = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+  const unsigned int blockCount = num_points / 8;
+  unsigned int idx;
+
+  // Vector part: 16 input bytes hold eight complex points.
+  for(idx = 0; idx < blockCount; idx++){
+    const __m128i packed = _mm_load_si128((const __m128i*)src);
+    const __m128i iBytes = _mm_shuffle_epi8(packed, pickI);
+    const __m128i qBytes = _mm_shuffle_epi8(packed, pickQ);
+
+    // Sign-extend to 16 bit, then shift left by 8 (multiply by 256).
+    _mm_store_si128((__m128i*)iOut, _mm_slli_epi16(_mm_cvtepi8_epi16(iBytes), 8));
+    _mm_store_si128((__m128i*)qOut, _mm_slli_epi16(_mm_cvtepi8_epi16(qBytes), 8));
+
+    src += 16;
+    iOut += 8;
+    qOut += 8;
+  }
+
+  // Scalar tail for the points that do not fill a whole block of eight.
+  for(idx = blockCount * 8; idx < num_points; idx++){
+    *iOut++ = ((int16_t)*src++) * 256;
+    *qOut++ = ((int16_t)*src++) * 256;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data, scaling every value by 256
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param complexVector The complex input vector
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  // The input is walked as raw bytes: I at the even offset, Q at the odd one.
+  const int8_t* src = (const int8_t*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, src += 2){
+    iBuffer[idx] = (int16_t)(src[0])*256;
+    qBuffer[idx] = (int16_t)(src[1])*256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h
new file mode 100644
index 000000000..d26b3d0d0
--- /dev/null
+++ b/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a_H
+#define INCLUDED_volk_8ic_deinterleave_real_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data, scaling every value by 128
+  \param iBuffer The I buffer output data (written with aligned stores)
+  \param complexVector The complex input vector (read with aligned loads)
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* src = (const int8_t*)complexVector;
+  int16_t* dst = iBuffer;
+  // Shuffle control: gather the even (I) bytes into the low eight bytes;
+  // a control byte of 0x80 produces a zero byte in the result.
+  const __m128i pickReal = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  const unsigned int blockCount = num_points / 8;
+  unsigned int idx;
+
+  // Vector part: 16 input bytes hold eight complex points.
+  for(idx = 0; idx < blockCount; idx++){
+    const __m128i packed = _mm_load_si128((const __m128i*)src);
+    const __m128i realBytes = _mm_shuffle_epi8(packed, pickReal);
+
+    // Sign-extend to 16 bit, then shift left by 7 (multiply by 128).
+    _mm_store_si128((__m128i*)dst, _mm_slli_epi16(_mm_cvtepi8_epi16(realBytes), 7));
+
+    src += 16;
+    dst += 8;
+  }
+
+  // Scalar tail: take the I byte and step over the Q byte.
+  for(idx = blockCount * 8; idx < num_points; idx++){
+    *dst++ = ((int16_t)*src) * 128;
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data, scaling every value by 128
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* src = (const int8_t*)complexVector;
+  unsigned int idx;
+
+  // Only every other input byte (the I component) is used.
+  for(idx = 0; idx < num_points; idx++, src += 2){
+    iBuffer[idx] = ((int16_t)(src[0])) * 128;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a_H */
diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h
new file mode 100644
index 000000000..21efed83e
--- /dev/null
+++ b/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I vector data
+  \param iBuffer The I buffer output data (written with aligned stores)
+  \param complexVector The complex input vector (read with aligned loads)
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* src = (const int8_t*)complexVector;
+  int8_t* dst = iBuffer;
+  // Shuffle controls: gather the even (I) bytes into the low or the high eight
+  // bytes; a control byte of 0x80 produces a zero byte in the result.
+  const __m128i toLowHalf = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  const __m128i toHighHalf = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  const unsigned int blockCount = num_points / 16;
+  unsigned int idx;
+
+  // Vector part: two 16 byte loads hold 16 complex points and yield 16 I bytes.
+  for(idx = 0; idx < blockCount; idx++){
+    const __m128i firstEight = _mm_shuffle_epi8(_mm_load_si128((const __m128i*)src), toLowHalf);
+    const __m128i secondEight = _mm_shuffle_epi8(_mm_load_si128((const __m128i*)(src + 16)), toHighHalf);
+
+    // The halves occupy disjoint bytes, so OR merges them into one vector.
+    _mm_store_si128((__m128i*)dst, _mm_or_si128(firstEight, secondEight));
+
+    src += 32;
+    dst += 16;
+  }
+
+  // Scalar tail: copy the I byte and step over the Q byte.
+  for(idx = blockCount * 16; idx < num_points; idx++){
+    *dst++ = *src;
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I vector data
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* src = (const int8_t*)complexVector;
+  unsigned int idx;
+
+  // Copy every other input byte (the I component) to the output.
+  for(idx = 0; idx < num_points; idx++, src += 2){
+    iBuffer[idx] = src[0];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h
new file mode 100644
index 000000000..d82da59fb
--- /dev/null
+++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h
@@ -0,0 +1,165 @@
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param iBuffer The I buffer output data (written with aligned stores)
+  \param qBuffer The Q buffer output data (written with aligned stores)
+  \param complexVector The complex input vector (read with aligned loads)
+  \param scalar The value each data point is divided by; applied as a multiplication by 1.0 / scalar
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iOut = iBuffer;
+  float* qOut = qBuffer;
+  const int8_t* src = (const int8_t*)complexVector;
+
+  const float reciprocal = 1.0 / scalar;
+  const __m128 reciprocalVec = _mm_set_ps1(reciprocal);
+  // Shuffle controls: gather the even (I) or odd (Q) bytes into the low eight
+  // bytes; a control byte of 0x80 produces a zero byte in the result.
+  const __m128i pickI = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  const __m128i pickQ = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+  const unsigned int blockCount = num_points / 8;
+  unsigned int idx;
+
+  // Vector part: 16 input bytes hold eight complex points.
+  for(idx = 0; idx < blockCount; idx++){
+    const __m128i packed = _mm_load_si128((const __m128i*)src);
+    __m128i iBytes = _mm_shuffle_epi8(packed, pickI);
+    __m128i qBytes = _mm_shuffle_epi8(packed, pickQ);
+    src += 16;
+
+    // I samples 0..3: sign-extend to 32 bit, convert to float, scale.
+    _mm_store_ps(iOut, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(iBytes)), reciprocalVec));
+    // I samples 4..7: bring the next four bytes down first.
+    iBytes = _mm_srli_si128(iBytes, 4);
+    _mm_store_ps(iOut + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(iBytes)), reciprocalVec));
+    iOut += 8;
+
+    // Q samples 0..3.
+    _mm_store_ps(qOut, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(qBytes)), reciprocalVec));
+    // Q samples 4..7.
+    qBytes = _mm_srli_si128(qBytes, 4);
+    _mm_store_ps(qOut + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(qBytes)), reciprocalVec));
+    qOut += 8;
+  }
+
+  // Scalar tail for the points that do not fill a whole block of eight.
+  for(idx = blockCount * 8; idx < num_points; idx++){
+    *iOut++ = (float)(*src++) * reciprocal;
+    *qOut++ = (float)(*src++) * reciprocal;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param iBuffer The I buffer output data (written with aligned stores)
+  \param qBuffer The Q buffer output data (written with aligned stores)
+  \param complexVector The complex input vector
+  \param scalar The value each data point is divided by
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iOut = iBuffer;
+  float* qOut = qBuffer;
+  const int8_t* src = (const int8_t*)complexVector;
+
+  const __m128 reciprocalVec = _mm_set_ps1(1.0/scalar);
+  const unsigned int blockCount = num_points / 4;
+  unsigned int idx;
+  unsigned int k;
+  __m128 firstPair, secondPair;
+
+  // Staging area: the bytes are converted to float one at a time here and
+  // then loaded as two aligned vectors.
+  __VOLK_ATTR_ALIGNED(16) float staged[8];
+
+  // Vector part: eight input bytes hold four complex points.
+  for(idx = 0; idx < blockCount; idx++){
+    for(k = 0; k < 8; k++){
+      staged[k] = (float)(src[k]);
+    }
+    src += 8;
+
+    firstPair = _mm_mul_ps(_mm_load_ps(&staged[0]), reciprocalVec);
+    secondPair = _mm_mul_ps(_mm_load_ps(&staged[4]), reciprocalVec);
+
+    // Elements 0 and 2 of each vector are I, elements 1 and 3 are Q.
+    _mm_store_ps(iOut, _mm_shuffle_ps(firstPair, secondPair, _MM_SHUFFLE(2,0,2,0)));
+    _mm_store_ps(qOut, _mm_shuffle_ps(firstPair, secondPair, _MM_SHUFFLE(3,1,3,1)));
+
+    iOut += 4;
+    qOut += 4;
+  }
+
+  // Scalar tail. NOTE(review): this divides by scalar while the vector loop
+  // multiplies by the reciprocal, so tail values may round differently.
+  idx = blockCount * 4;
+  src = (const int8_t*)&complexVector[idx];
+  for(; idx < num_points; idx++){
+    *iOut++ = (float)(*src++) / scalar;
+    *qOut++ = (float)(*src++) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param complexVector The complex input vector
+  \param scalar The value each data point is divided by; applied as a multiplication by 1.0 / scalar
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  // The input is walked as raw bytes: I at the even offset, Q at the odd one.
+  const int8_t* src = (const int8_t*)complexVector;
+  const float reciprocal = 1.0 / scalar;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, src += 2){
+    iBuffer[idx] = (float)(src[0])*reciprocal;
+    qBuffer[idx] = (float)(src[1])*reciprocal;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */
diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h
new file mode 100644
index 000000000..b2c15d3a3
--- /dev/null
+++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h
@@ -0,0 +1,134 @@
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Extracts the I (real) samples of an interleaved complex 8 bit vector as floats, each divided by scalar
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector
+  \param scalar The value every data point is divided by
+  \param num_points The number of complex data values to be deinterleaved
+  \note The vector loop uses aligned 16 byte loads and stores
+*/
+static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  const unsigned int eighthPoints = num_points / 8;
+  const float iScalar = 1.0 / scalar;
+  const __m128 scaleVec = _mm_set_ps1(iScalar);
+
+  // Shuffle control: gather the even (I) bytes into the low 8 bytes; 0x80 zeroes the upper 8
+  const __m128i pickReal = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+
+  int8_t* src = (int8_t*)complexVector;
+  float* dst = iBuffer;
+  unsigned int number;
+
+  // Eight complex samples (16 bytes) per iteration
+  for(number = 0; number < eighthPoints; number++){
+    __m128i packed = _mm_shuffle_epi8(_mm_load_si128((__m128i*)src), pickReal);
+    __m128 lowFour;
+    __m128 highFour;
+    src += 16;
+
+    // First four I bytes: sign extend to 32 bit, convert to float, scale
+    lowFour = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(packed));
+    _mm_store_ps(dst, _mm_mul_ps(lowFour, scaleVec));
+    dst += 4;
+
+    // Shift the next four I bytes down and repeat
+    highFour = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_srli_si128(packed, 4)));
+    _mm_store_ps(dst, _mm_mul_ps(highFour, scaleVec));
+    dst += 4;
+  }
+
+  // Scalar tail for the samples that do not fill a whole vector
+  for(number = eighthPoints * 8; number < num_points; number++){
+    *dst++ = (float)(*src) * iScalar;
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Extracts the I (real) samples of an interleaved complex 8 bit vector as floats, each divided by scalar
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector
+  \param scalar The value every data point is divided by
+  \param num_points The number of complex data values to be deinterleaved
+  \note The vector loop writes iBuffer with aligned 16 byte stores
+*/
+static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  const float iScalar = 1.0 / scalar;
+  const __m128 invScalar = _mm_set_ps1(iScalar);
+
+  int8_t* src = (int8_t*)complexVector;
+  float* dst = iBuffer;
+  unsigned int number;
+  unsigned int lane;
+
+  // Staging buffer: the I bytes are converted to float one lane at a time in scalar code
+  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+  for(number = 0; number < quarterPoints; number++){
+    // Pick every second byte (the I component) of four complex samples
+    for(lane = 0; lane < 4; lane++){
+      floatBuffer[lane] = (float)(src[2 * lane]);
+    }
+    src += 8;
+
+    _mm_store_ps(dst, _mm_mul_ps(_mm_load_ps(floatBuffer), invScalar));
+    dst += 4;
+  }
+
+  // Scalar tail for the samples that do not fill a whole vector
+  for(number = quarterPoints * 4; number < num_points; number++){
+    *dst++ = (float)(*src) * iScalar;
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Extracts the I (real) samples of an interleaved complex 8 bit vector as floats, each divided by scalar
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector
+  \param scalar The value every data point is divided by
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  // Flat byte view of the input: I0, Q0, I1, Q1, ...
+  const int8_t* samples = (const int8_t*)complexVector;
+  const float invScalar = 1.0 / scalar;
+  float* out = iBuffer;
+  unsigned int remaining;
+
+  for(remaining = num_points; remaining > 0; remaining--){
+    *out = ((float)(samples[0])) * invScalar;
+    out++;
+    samples += 2; // step over the Q byte as well
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */
diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h
new file mode 100644
index 000000000..f85fdb999
--- /dev/null
+++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h
@@ -0,0 +1,101 @@
+#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
+#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Multiplies one complex vector by the complex conjugate of a second complex vector and stores the 16 bit results in a third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector The complex vector to be multiplied
+  \param bVector The complex vector that is conjugated before being multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  \note Results are saturated to the int16_t range, both in the vector loop and in the scalar tail
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m128i x, y, realz, imagz;
+  lv_16sc_t* c = cVector;
+  const lv_8sc_t* a = aVector;
+  const lv_8sc_t* b = bVector;
+  // Per 16 bit lane: +1 keeps the real parts, -1 negates the imaginary parts
+  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+
+  for(;number < quarterPoints; number++){
+    // Widen four complex 8 bit values of each input to 16 bit lanes
+    x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+    y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+
+    // Real part: ar*br + ai*bi
+    realz = _mm_madd_epi16(x,y);
+
+    // Conjugate b: (br, bi) -> (br, -bi)
+    y = _mm_sign_epi16(y, conjugateSign);
+
+    // Swap each pair: (br, -bi) -> (-bi, br)
+    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
+
+    // Imaginary part: ar*(-bi) + ai*br
+    imagz = _mm_madd_epi16(x,y);
+
+    // Interleave real/imag and narrow to 16 bit with signed saturation
+    _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz)));
+
+    a += 4;
+    b += 4;
+    c += 4;
+  }
+
+  number = quarterPoints * 4;
+  int16_t* c16Ptr = (int16_t*)&cVector[number];
+  const int8_t* a8Ptr = (const int8_t*)&aVector[number];
+  const int8_t* b8Ptr = (const int8_t*)&bVector[number];
+  for(; number < num_points; number++){
+    const int32_t aReal = *a8Ptr++;
+    const int32_t aImag = *a8Ptr++;
+    const int32_t bReal = *b8Ptr++;
+    const int32_t bImag = *b8Ptr++;
+
+    // a * conj(b) in exact integer arithmetic
+    int32_t real = aReal * bReal + aImag * bImag;
+    int32_t imag = aImag * bReal - aReal * bImag;
+
+    // Saturate like _mm_packs_epi32 does above; a plain cast of an out-of-range
+    // value (real == 32768 when all four components are -128) is not well defined
+    if(real > 32767) real = 32767;
+    else if(real < -32768) real = -32768;
+    if(imag > 32767) imag = 32767;
+    else if(imag < -32768) imag = -32768;
+
+    *c16Ptr++ = (int16_t)real;
+    *c16Ptr++ = (int16_t)imag;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies one complex vector by the complex conjugate of a second complex vector and stores the 16 bit results in a third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector The complex vector to be multiplied
+  \param bVector The complex vector that is conjugated before being multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  \note Results are saturated to the int16_t range
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+  int16_t* c16Ptr = (int16_t*)cVector;
+  const int8_t* a8Ptr = (const int8_t*)aVector;
+  const int8_t* b8Ptr = (const int8_t*)bVector;
+  for(number = 0; number < num_points; number++){
+    const int32_t aReal = *a8Ptr++;
+    const int32_t aImag = *a8Ptr++;
+    const int32_t bReal = *b8Ptr++;
+    const int32_t bImag = *b8Ptr++;
+
+    // a * conj(b) = (ar*br + ai*bi) + j(ai*br - ar*bi), computed exactly in integers
+    int32_t real = aReal * bReal + aImag * bImag;
+    int32_t imag = aImag * bReal - aReal * bImag;
+
+    // Saturate instead of casting blindly: real reaches 32768 when all four
+    // components are -128, which does not fit in int16_t
+    if(real > 32767) real = 32767;
+    else if(real < -32768) real = -32768;
+    if(imag > 32767) imag = 32767;
+    else if(imag < -32768) imag = -32768;
+
+    *c16Ptr++ = (int16_t)real;
+    *c16Ptr++ = (int16_t)imag;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H */
diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h
new file mode 100644
index 000000000..4b16171ce
--- /dev/null
+++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
+#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Multiplies one complex vector by the complex conjugate of a second complex vector, divides by scalar and stores the float results in a third vector
+  \param cVector The complex float vector where the results will be stored
+  \param aVector The complex vector to be multiplied
+  \param bVector The complex vector that is conjugated before being multiplied
+  \param scalar The value every product is divided by
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  unsigned int number;
+
+  __m128i aWide, bWide, realz, imagz;
+  lv_32fc_t* out = cVector;
+  const lv_8sc_t* aIn = aVector;
+  const lv_8sc_t* bIn = bVector;
+
+  // Per 16 bit lane: +1 keeps the real parts, -1 negates the imaginary parts
+  const __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
+
+  for(number = 0; number < quarterPoints; number++){
+    // Widen four complex 8 bit values of each input to 16 bit lanes
+    aWide = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)aIn));
+    bWide = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)bIn));
+    aIn += 4;
+    bIn += 4;
+
+    // Real part: ar*br + ai*bi
+    realz = _mm_madd_epi16(aWide, bWide);
+
+    // Conjugate b, then swap each (real, imag) pair: (br, bi) -> (-bi, br)
+    bWide = _mm_sign_epi16(bWide, conjugateSign);
+    bWide = _mm_shufflelo_epi16(bWide, _MM_SHUFFLE(2,3,0,1));
+    bWide = _mm_shufflehi_epi16(bWide, _MM_SHUFFLE(2,3,0,1));
+
+    // Imaginary part: ar*(-bi) + ai*br
+    imagz = _mm_madd_epi16(aWide, bWide);
+
+    // First two results: interleave real/imag, convert to float, normalize, store
+    _mm_store_ps((float*)out, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)), invScalar));
+    out += 2;
+
+    // Last two results
+    _mm_store_ps((float*)out, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)), invScalar));
+    out += 2;
+  }
+
+  // Scalar tail for the values that do not fill a whole vector
+  number = quarterPoints * 4;
+  float* outFloats = (float*)&cVector[number];
+  int8_t* aBytes = (int8_t*)&aVector[number];
+  int8_t* bBytes = (int8_t*)&bVector[number];
+  while(number < num_points){
+    float aReal = (float)aBytes[0];
+    float aImag = (float)aBytes[1];
+    float bReal = (float)bBytes[0];
+    float bImag = (float)bBytes[1];
+    lv_32fc_t aVal = lv_cmake(aReal, aImag );
+    lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+
+    outFloats[0] = lv_creal(temp) / scalar;
+    outFloats[1] = lv_cimag(temp) / scalar;
+
+    aBytes += 2;
+    bBytes += 2;
+    outFloats += 2;
+    number++;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies one complex vector by the complex conjugate of a second complex vector, divides by scalar and stores the float results in a third vector
+  \param cVector The complex float vector where the results will be stored
+  \param aVector The complex vector to be multiplied
+  \param bVector The complex vector that is conjugated before being multiplied
+  \param scalar The value every product is divided by
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+  const float invScalar = 1.0 / scalar;
+  // Flat views: inputs as bytes (re, im, re, im, ...), output as floats
+  int8_t* aBytes = (int8_t*)aVector;
+  int8_t* bBytes = (int8_t*)bVector;
+  float* out = (float*)cVector;
+  unsigned int remaining;
+
+  for(remaining = num_points; remaining > 0; remaining--){
+    float aReal = (float)aBytes[0];
+    float aImag = (float)aBytes[1];
+    float bReal = (float)bBytes[0];
+    float bImag = (float)bBytes[1];
+
+    // Negating the imaginary part of b gives its conjugate
+    lv_32fc_t aVal = lv_cmake(aReal, aImag );
+    lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+
+    out[0] = (lv_creal(temp) * invScalar);
+    out[1] = (lv_cimag(temp) * invScalar);
+
+    aBytes += 2;
+    bBytes += 2;
+    out += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h
new file mode 100644
index 000000000..38263d5f7
--- /dev/null
+++ b/volk/include/volk/volk_common.h
@@ -0,0 +1,96 @@
+#ifndef INCLUDED_LIBVOLK_COMMON_H
+#define INCLUDED_LIBVOLK_COMMON_H
+
+////////////////////////////////////////////////////////////////////////
+// Cross-platform attribute macros
+////////////////////////////////////////////////////////////////////////
+#if defined __GNUC__
+# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+# define __VOLK_ATTR_UNUSED __attribute__((unused))
+# define __VOLK_ATTR_INLINE __attribute__((always_inline))
+# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+# if __GNUC__ >= 4
+# define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+# define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
+# else
+# define __VOLK_ATTR_EXPORT
+# define __VOLK_ATTR_IMPORT
+# endif
+#elif defined(_MSC_VER) /* test with defined() so an undefined macro is never evaluated */
+# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
+# define __VOLK_ATTR_UNUSED
+# define __VOLK_ATTR_INLINE __forceinline
+# define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
+# define __VOLK_ATTR_EXPORT __declspec(dllexport)
+# define __VOLK_ATTR_IMPORT __declspec(dllimport)
+#else /* unknown compiler: every attribute expands to nothing */
+# define __VOLK_ATTR_ALIGNED(x)
+# define __VOLK_ATTR_UNUSED
+# define __VOLK_ATTR_INLINE
+# define __VOLK_ATTR_DEPRECATED
+# define __VOLK_ATTR_EXPORT
+# define __VOLK_ATTR_IMPORT
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// Ignore annoying warnings in MSVC
+////////////////////////////////////////////////////////////////////////
+#if defined(_MSC_VER)
+# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data
+# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2'
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// C-linkage declaration macros
+// FIXME: due to the usage of complex.h, require gcc for c-linkage
+////////////////////////////////////////////////////////////////////////
+#if defined(__cplusplus) && defined(__GNUC__)
+# define __VOLK_DECL_BEGIN extern "C" {
+# define __VOLK_DECL_END }
+#else
+# define __VOLK_DECL_BEGIN
+# define __VOLK_DECL_END
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// Define VOLK_API for library symbols
+// http://gcc.gnu.org/wiki/Visibility
+////////////////////////////////////////////////////////////////////////
+#ifdef volk_EXPORTS /* presumably defined by the build while compiling the library itself - confirm */
+# define VOLK_API __VOLK_ATTR_EXPORT
+#else /* every other translation unit gets the import attribute */
+# define VOLK_API __VOLK_ATTR_IMPORT
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// The bit128 union used by some
+////////////////////////////////////////////////////////////////////////
+#include <inttypes.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+#endif
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+#endif
+
+union bit128{ /* one storage area viewable as several element types */
+ uint16_t i16[8]; /* eight 16 bit unsigned integers */
+ uint32_t i[4]; /* four 32 bit unsigned integers */
+ float f[4]; /* four floats */
+ double d[2]; /* two doubles */
+
+ #ifdef LV_HAVE_SSE
+ __m128 float_vec; /* SSE float vector view, only present when LV_HAVE_SSE is defined */
+ #endif
+
+ #ifdef LV_HAVE_SSE2
+ __m128i int_vec; /* SSE2 integer vector view, only present when LV_HAVE_SSE2 is defined */
+ __m128d double_vec; /* SSE2 double vector view, only present when LV_HAVE_SSE2 is defined */
+ #endif
+};
+
+#define bit128_p(x) ((union bit128 *)(x)) /* reinterpret the pointer x as a pointer to union bit128 */
+
+#endif /*INCLUDED_LIBVOLK_COMMON_H*/
diff --git a/volk/include/volk/volk_complex.h b/volk/include/volk/volk_complex.h
new file mode 100644
index 000000000..5bd925044
--- /dev/null
+++ b/volk/include/volk/volk_complex.h
@@ -0,0 +1,86 @@
+#ifndef INCLUDE_VOLK_COMPLEX_H
+#define INCLUDE_VOLK_COMPLEX_H
+
+/*!
+ * \brief Provide typedefs and operators for all complex types in C and C++.
+ *
+ * The typedefs encompass all signed integer and floating point types.
+ * Each operator function is intended to work across all data types.
+ * Under C++, these operators are defined as inline templates.
+ * Under C, these operators are defined as preprocessor macros.
+ * The use of macros makes the operators agnostic to the type.
+ *
+ * The following operator functions are defined:
+ * - lv_cmake - make a complex type from components
+ * - lv_creal - get the real part of the complex number
+ * - lv_cimag - get the imaginary part of the complex number
+ * - lv_conj - take the conjugate of the complex number
+ */
+
+#ifdef __cplusplus
+
+#include <complex>
+#include <stdint.h>
+
+typedef std::complex<int8_t> lv_8sc_t;
+typedef std::complex<int16_t> lv_16sc_t;
+typedef std::complex<int32_t> lv_32sc_t;
+typedef std::complex<int64_t> lv_64sc_t;
+typedef std::complex<float> lv_32fc_t;
+typedef std::complex<double> lv_64fc_t;
+
+/*! Builds a std::complex<T> from a real part r and an imaginary part i. */
+template <typename T> inline std::complex<T> lv_cmake(const T &r, const T &i){
+  std::complex<T> z(r, i);
+  return z;
+}
+
+/*! Returns the real component of x; T must provide value_type and real(). */
+template <typename T> inline typename T::value_type lv_creal(const T &x){
+  const typename T::value_type realPart = x.real();
+  return realPart;
+}
+
+/*! Returns the imaginary component of x; T must provide value_type and imag(). */
+template <typename T> inline typename T::value_type lv_cimag(const T &x){
+  const typename T::value_type imagPart = x.imag();
+  return imagPart;
+}
+
+/*! Returns the complex conjugate of x as computed by std::conj. */
+template <typename T> inline T lv_conj(const T &x){
+  T conjugated = std::conj(x);
+  return conjugated;
+}
+
+#else /* __cplusplus */
+
+#include <complex.h>
+
+typedef char complex lv_8sc_t; /* NOTE(review): plain char can be unsigned on some targets, while the C++ branch uses int8_t - confirm */
+typedef short complex lv_16sc_t;
+typedef long complex lv_32sc_t; /* NOTE(review): long is 64 bits on LP64 targets, while the C++ branch uses int32_t - confirm the intended width */
+typedef long long complex lv_64sc_t;
+typedef float complex lv_32fc_t;
+typedef double complex lv_64fc_t;
+
+#define lv_cmake(r, i) ((r) + _Complex_I*(i)) /* build r + j*i from two scalar components */
+
+// When GNUC is available, use the complex extensions.
+// The extensions always return the correct value type.
+// http://gcc.gnu.org/onlinedocs/gcc/Complex.html
+#ifdef __GNUC__
+
+#define lv_creal(x) (__real__(x)) /* real part, in the element type of x */
+
+#define lv_cimag(x) (__imag__(x)) /* imaginary part, in the element type of x */
+
+#define lv_conj(x) (~(x)) /* GNU extension: ~ applied to a complex value yields its conjugate */
+
+// When not available, use the c99 complex function family,
+// which always returns double regardless of the input type.
+#else /* __GNUC__ */
+
+#define lv_creal(x) (creal(x)) /* C99 creal() */
+
+#define lv_cimag(x) (cimag(x)) /* C99 cimag() */
+
+#define lv_conj(x) (conj(x)) /* C99 conj() */
+
+#endif /* __GNUC__ */
+
+#endif /* __cplusplus */
+
+#endif /* INCLUDE_VOLK_COMPLEX_H */
diff --git a/volk/include/volk/volk_prefs.h b/volk/include/volk/volk_prefs.h
new file mode 100644
index 000000000..690e5f99f
--- /dev/null
+++ b/volk/include/volk/volk_prefs.h
@@ -0,0 +1,28 @@
+#ifndef INCLUDED_VOLK_PREFS_H
+#define INCLUDED_VOLK_PREFS_H
+
+#include <volk/volk_common.h>
+#include <stdlib.h>
+
+__VOLK_DECL_BEGIN
+
+typedef struct volk_arch_pref
+{
+ char name[128]; //name of the kernel (fixed 128 byte buffer)
+ char impl_a[128]; //best aligned impl for that kernel
+ char impl_u[128]; //best unaligned impl for that kernel
+} volk_arch_pref_t; //one preference entry: a kernel name and its preferred implementations
+
+////////////////////////////////////////////////////////////////////////
+// get path to volk_config profiling info
+////////////////////////////////////////////////////////////////////////
+VOLK_API void volk_get_config_path(char *);
+
+////////////////////////////////////////////////////////////////////////
+// load prefs into global prefs struct
+////////////////////////////////////////////////////////////////////////
+VOLK_API size_t volk_load_preferences(volk_arch_pref_t **);
+
+__VOLK_DECL_END
+
+#endif //INCLUDED_VOLK_PREFS_H
diff --git a/volk/kernels/README.txt b/volk/kernels/README.txt
new file mode 100644
index 000000000..5dd7434b5
--- /dev/null
+++ b/volk/kernels/README.txt
@@ -0,0 +1,67 @@
+########################################################################
+# How to create custom kernel dispatchers
+########################################################################
+A kernel dispatcher is a kernel implementation that calls other kernel implementations.
+By default, a dispatcher is generated by the build system for every kernel such that:
+ * the best aligned implementation is called when all pointer arguments are aligned,
+ * and otherwise the best unaligned implementation is called.
+
+The author of a VOLK kernel may create a custom dispatcher,
+to be called in place of the automatically generated one.
+A custom dispatcher may be useful to handle head and tail cases,
+or to implement different alignment and bounds checking logic.
+
+########################################################################
+# Code for an example dispatcher w/ tail case
+########################################################################
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_DISPATCHER
+
+static inline void volk_32f_x2_add_32f_dispatcher(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+{
+ const unsigned int num_points_r = num_points%4;
+ const unsigned int num_points_x = num_points - num_points_r;
+
+ if (volk_is_aligned(VOLK_OR_PTR(cVector, VOLK_OR_PTR(aVector, bVector))))
+ {
+ volk_32f_x2_add_32f_a(cVector, aVector, bVector, num_points_x);
+ }
+ else
+ {
+ volk_32f_x2_add_32f_u(cVector, aVector, bVector, num_points_x);
+ }
+
+ volk_32f_x2_add_32f_g(cVector+num_points_x, aVector+num_points_x, bVector+num_points_x, num_points_r);
+}
+
+#endif //LV_HAVE_DISPATCHER
+
+########################################################################
+# Code for an example dispatcher w/ tail case and accumulator
+########################################################################
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_DISPATCHER
+
+static inline void volk_32f_x2_dot_prod_32f_dispatcher(float * result, const float * input, const float * taps, unsigned int num_points)
+{
+ const unsigned int num_points_r = num_points%16;
+ const unsigned int num_points_x = num_points - num_points_r;
+
+ if (volk_is_aligned(VOLK_OR_PTR(input, taps)))
+ {
+ volk_32f_x2_dot_prod_32f_a(result, input, taps, num_points_x);
+ }
+ else
+ {
+ volk_32f_x2_dot_prod_32f_u(result, input, taps, num_points_x);
+ }
+
+ float result_tail = 0;
+ volk_32f_x2_dot_prod_32f_g(&result_tail, input+num_points_x, taps+num_points_x, num_points_r);
+
+ *result += result_tail;
+}
+
+#endif //LV_HAVE_DISPATCHER
diff --git a/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
new file mode 100644
index 000000000..8bc1569f6
--- /dev/null
+++ b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+#define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Computes the dot product of a real 16 bit vector and a complex float tap vector
+  \param result Where the complex dot product is stored
+  \param input The 16 bit real input samples
+  \param taps The complex float taps
+  \param num_points The number of samples in input and in taps
+*/
+static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+
+  static const int N_UNROLL = 4;
+
+  // Number of points covered by the unrolled loop (a multiple of N_UNROLL)
+  const unsigned unrolledEnd = (num_points / N_UNROLL) * N_UNROLL;
+  const short* samplePtr = input;
+  const lv_32fc_t* tapPtr = taps;
+  unsigned idx;
+
+  // Four independent partial sums, one per unrolled lane
+  lv_32fc_t sum0 = 0;
+  lv_32fc_t sum1 = 0;
+  lv_32fc_t sum2 = 0;
+  lv_32fc_t sum3 = 0;
+
+  for(idx = 0; idx < unrolledEnd; idx += N_UNROLL) {
+    sum0 += tapPtr[0] * (float)samplePtr[0];
+    sum1 += tapPtr[1] * (float)samplePtr[1];
+    sum2 += tapPtr[2] * (float)samplePtr[2];
+    sum3 += tapPtr[3] * (float)samplePtr[3];
+    tapPtr += 4;
+    samplePtr += 4;
+  }
+
+  // Leftover points are folded into the first partial sum
+  while(idx < num_points) {
+    sum0 += (*tapPtr) * (float)(*samplePtr);
+    tapPtr++;
+    samplePtr++;
+    idx++;
+  }
+
+  *result = sum0 + sum1 + sum2 + sum3;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_MMX
+
+
+/*!
+  \brief Computes the dot product of a real 16 bit vector and a complex float tap vector using MMX and SSE
+  \param result Where the complex dot product is stored
+  \param input The 16 bit real input samples
+  \param taps The complex float taps (read with aligned 16 byte loads)
+  \param num_points The number of samples in input and in taps
+*/
+static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  // Each vector iteration consumes 8 input samples and 8 complex taps
+  const unsigned int eighthPoints = num_points / 8;
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const short* aPtr = input;
+  const float* bPtr = (float*)taps;
+
+  __m64 m0, m1;
+  __m128 f0, f1;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < eighthPoints; number++){
+
+    // Convert eight 16 bit samples to float through the MMX registers
+    m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+    m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+    f0 = _mm_cvtpi16_ps(m0);
+    f1 = _mm_cvtpi16_ps(m1);
+
+    // Duplicate every sample so it lines up with the (real, imag) pair of its tap
+    a0Val = _mm_unpacklo_ps(f0, f0);
+    a1Val = _mm_unpackhi_ps(f0, f0);
+    a2Val = _mm_unpacklo_ps(f1, f1);
+    a3Val = _mm_unpackhi_ps(f1, f1);
+
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 8;
+    bPtr += 16;
+  }
+
+  // The MMX registers alias the x87 register stack; clear the MMX state so that
+  // floating point code running after this point is not corrupted
+  _mm_empty();
+
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  // Lanes 0 and 2 hold real partial sums, lanes 1 and 3 imaginary partial sums
+  *realpt = dotProductVector[0];
+  *imagpt = dotProductVector[1];
+  *realpt += dotProductVector[2];
+  *imagpt += dotProductVector[3];
+
+  // Scalar tail for the points that do not fill a whole vector iteration
+  number = eighthPoints*8;
+  for(;number < num_points; number++){
+    *realpt += ((*aPtr) * (*bPtr++));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
+
+
+#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/
diff --git a/volk/kernels/volk/volk_16i_branch_4_state_8.h b/volk/kernels/volk/volk_16i_branch_4_state_8.h
new file mode 100644
index 000000000..cdfbc7ba1
--- /dev/null
+++ b/volk/kernels/volk/volk_16i_branch_4_state_8.h
@@ -0,0 +1,194 @@
+#ifndef INCLUDED_volk_16i_branch_4_state_8_a_H
+#define INCLUDED_volk_16i_branch_4_state_8_a_H
+
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+
+
+#ifdef LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+ /* Writes four vectors of eight shorts to target: src0 permuted by one byte mask each, */
+ /* plus selected broadcast scalars, plus (cntl2 & scalars[2]) + (cntl3 & scalars[3]). */
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+
+ __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+
+
+
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+ p_cntl2 = (__m128i*)cntl2;
+ p_cntl3 = (__m128i*)cntl3;
+ p_scalars = (__m128i*)scalars;
+
+ int i = 0;
+
+ int bound = 1;
+
+
+ xmm0 = _mm_load_si128(p_scalars); /* aligned load of the first eight shorts of scalars */
+
+ xmm1 = _mm_shufflelo_epi16(xmm0, 0); /* scalars[0] replicated over the low four words */
+ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); /* scalars[1] replicated over the low four words */
+ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); /* scalars[2] replicated over the low four words */
+ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); /* scalars[3] replicated over the low four words */
+
+ xmm1 = _mm_shuffle_epi32(xmm1, 0x00); /* ...then over all eight words */
+ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+ xmm0 = _mm_load_si128((__m128i*)permuters[0]); /* four byte-shuffle masks, one per output vector */
+ xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+ xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+ xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+
+ for(; i < bound; ++i) { /* bound is 1, so the body executes exactly once */
+
+ xmm5 = _mm_load_si128(p_src0); /* eight source shorts */
+
+
+
+
+
+
+
+
+
+ xmm0 = _mm_shuffle_epi8(xmm5, xmm0); /* permuted copies of the source; each mask register is overwritten by its result */
+ xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+ xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+ xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+
+ p_src0 += 4; /* NOTE(review): only one vector was read from p_src0; the stride only matters if bound is raised - confirm */
+
+
+ xmm5 = _mm_add_epi16(xmm1, xmm2); /* scalars[0] + scalars[1] */
+
+ xmm6 = _mm_add_epi16(xmm2, xmm6); /* vector 1 gets scalars[1] */
+ xmm8 = _mm_add_epi16(xmm1, xmm8); /* vector 2 gets scalars[0] */
+
+
+ xmm7 = _mm_load_si128(p_cntl2);
+ xmm9 = _mm_load_si128(p_cntl3);
+
+ xmm0 = _mm_add_epi16(xmm5, xmm0); /* vector 0 gets scalars[0] + scalars[1] */
+
+
+ xmm7 = _mm_and_si128(xmm7, xmm3); /* cntl2 words masked with scalars[2] */
+ xmm9 = _mm_and_si128(xmm9, xmm4); /* cntl3 words masked with scalars[3] */
+
+ xmm5 = _mm_load_si128(&p_cntl2[1]);
+ xmm11 = _mm_load_si128(&p_cntl3[1]);
+
+ xmm7 = _mm_add_epi16(xmm7, xmm9); /* masked control sum for vector 0 */
+
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm7); /* vector 0 complete */
+
+
+
+ xmm7 = _mm_load_si128(&p_cntl2[2]);
+ xmm9 = _mm_load_si128(&p_cntl3[2]);
+
+ xmm5 = _mm_add_epi16(xmm5, xmm11); /* masked control sum for vector 1 */
+
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+ xmm6 = _mm_add_epi16(xmm6, xmm5); /* vector 1 complete */
+
+
+ xmm5 = _mm_load_si128(&p_cntl2[3]);
+ xmm11 = _mm_load_si128(&p_cntl3[3]);
+
+ xmm7 = _mm_add_epi16(xmm7, xmm9); /* masked control sum for vector 2 */
+
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+ xmm8 = _mm_add_epi16(xmm8, xmm7); /* vector 2 complete */
+
+ xmm5 = _mm_add_epi16(xmm5, xmm11); /* masked control sum for vector 3 */
+
+ _mm_store_si128(p_target, xmm0);
+ _mm_store_si128(&p_target[1], xmm6);
+
+ xmm10 = _mm_add_epi16(xmm5, xmm10); /* vector 3 complete; no broadcast scalar is added to it */
+
+ _mm_store_si128(&p_target[2], xmm8);
+
+ _mm_store_si128(&p_target[3], xmm10);
+
+ p_target += 3; /* NOTE(review): four vectors were stored but the pointer advances by three; harmless while bound is 1 - confirm */
+ }
+}
+
+
+#endif /*LV_HAVE_SSSE3*/
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Fills four rows of eight shorts from permuted source samples, selected scalars and masked control words
+  \param target Output buffer; 32 shorts are written
+  \param src0 Source samples, indexed through the permuters
+  \param permuters Four byte tables; entry [row][2*col] divided by two selects the src0 element for that output
+  \param cntl2 Control words that are ANDed with scalars[2]
+  \param cntl3 Control words that are ANDed with scalars[3]
+  \param scalars scalars[0] is added to rows 0 and 2, scalars[1] to rows 0 and 1; scalars[2] and scalars[3] act as masks
+*/
+static inline void volk_16i_branch_4_state_8_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+  int row;
+  int col;
+
+  for(row = 0; row < 4; ++row) {
+    for(col = 0; col < 8; ++col) {
+      // Flat position of this element in target, cntl2 and cntl3
+      const int pos = row * 8 + col;
+
+      target[pos] = src0[((char)permuters[row][col * 2])/2]
+        + ((row + 1)%2 * scalars[0])
+        + (((row >> 1)^1) * scalars[1])
+        + (cntl2[pos] & scalars[2])
+        + (cntl3[pos] & scalars[3]);
+    }
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_branch_4_state_8_a_H*/
diff --git a/volk/kernels/volk/volk_16i_convert_8i.h b/volk/kernels/volk/volk_16i_convert_8i.h
new file mode 100644
index 000000000..3789b2e4a
--- /dev/null
+++ b/volk/kernels/volk/volk_16i_convert_8i.h
@@ -0,0 +1,140 @@
+#ifndef INCLUDED_volk_16i_convert_8i_u_H
+#define INCLUDED_volk_16i_convert_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Converts 16 bit integers to 8 bit integers by keeping the upper byte of each input value
+  \param outputVector The 8 bit output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param num_points The number of data values to be converted
+  \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  const unsigned int sixteenthPoints = num_points / 16;
+  const __m128i* src = (const __m128i*)inputVector;
+  __m128i* dst = (__m128i*)outputVector;
+  unsigned int block;
+  unsigned int idx;
+
+  for(block = 0; block < sixteenthPoints; block++){
+    // Two unaligned loads fetch 16 inputs; the arithmetic shift keeps each signed upper byte
+    const __m128i firstEight  = _mm_srai_epi16(_mm_loadu_si128(src + 2 * block), 8);
+    const __m128i secondEight = _mm_srai_epi16(_mm_loadu_si128(src + 2 * block + 1), 8);
+
+    // Narrow both halves to sixteen 8 bit values and write them out unaligned
+    _mm_storeu_si128(dst + block, _mm_packs_epi16(firstEight, secondEight));
+  }
+
+  // Scalar tail for the points that do not fill a whole block of 16
+  for(idx = sixteenthPoints * 16; idx < num_points; idx++){
+    outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts 16 bit integers to 8 bit integers by keeping the upper byte of each input value
+  \param outputVector The 8 bit output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param num_points The number of data values to be converted
+  \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    // Shift right by 8 so only the upper byte survives the narrowing cast
+    outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_convert_8i_u_H */
+#ifndef INCLUDED_volk_16i_convert_8i_a_H
+#define INCLUDED_volk_16i_convert_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Converts 16 bit integers to 8 bit integers by keeping the upper byte of each input value
+  \param outputVector The 8 bit output data buffer (written with aligned 16 byte stores)
+  \param inputVector The 16 bit input data buffer (read with aligned 16 byte loads)
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  const unsigned int sixteenthPoints = num_points / 16;
+  const __m128i* src = (const __m128i*)inputVector;
+  __m128i* dst = (__m128i*)outputVector;
+  unsigned int block;
+  unsigned int idx;
+
+  for(block = 0; block < sixteenthPoints; block++){
+    // Two aligned loads fetch 16 inputs; the arithmetic shift keeps each signed upper byte
+    const __m128i firstEight  = _mm_srai_epi16(_mm_load_si128(src + 2 * block), 8);
+    const __m128i secondEight = _mm_srai_epi16(_mm_load_si128(src + 2 * block + 1), 8);
+
+    // Narrow both halves to sixteen 8 bit values and write them with one aligned store
+    _mm_store_si128(dst + block, _mm_packs_epi16(firstEight, secondEight));
+  }
+
+  // Scalar tail for the points that do not fill a whole block of 16
+  for(idx = sixteenthPoints * 16; idx < num_points; idx++){
+    outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts 16 bit integers to 8 bit integers by keeping the upper byte of each input value
+  \param outputVector The 8 bit output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    // Shift right by 8 so only the upper byte survives the narrowing cast
+    outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_convert_8i_a_H */
diff --git a/volk/kernels/volk/volk_16i_max_star_16i.h b/volk/kernels/volk/volk_16i_max_star_16i.h
new file mode 100644
index 000000000..c67351c5f
--- /dev/null
+++ b/volk/kernels/volk/volk_16i_max_star_16i.h
@@ -0,0 +1,110 @@
+#ifndef INCLUDED_volk_16i_max_star_16i_a_H
+#define INCLUDED_volk_16i_max_star_16i_a_H
+
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+/*!
+  \brief Finds the largest 16 bit value in src0 and writes it to target[0]
+  \param target Output buffer; only target[0] is written
+  \param src0 Input buffer; read with aligned 16 byte loads, and src0[0] is always read
+  \param num_points The number of values in src0
+*/
+static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points) {
+
+  const unsigned int num_bytes = num_points*2;
+  const int bound = num_bytes >> 4;
+  const int leftovers = (num_bytes >> 1) & 7;
+  const __m128i* p_src0 = (const __m128i*)src0;
+
+  short candidate = src0[0];
+  short cands[8];
+  __m128i lane_max;
+  int i = 0;
+
+  // Seed every lane with a real input value. Seeding with zero would let 0 win
+  // whenever all inputs are negative or fewer than eight points are supplied.
+  lane_max = _mm_set1_epi16(candidate);
+
+  // Per-lane signed maximum over all complete blocks of eight values
+  for(i = 0; i < bound; ++i) {
+    lane_max = _mm_max_epi16(lane_max, _mm_load_si128(p_src0 + i));
+  }
+
+  // cands is a plain stack array with no 16 byte alignment guarantee,
+  // so it must be written with an unaligned store
+  _mm_storeu_si128((__m128i*)cands, lane_max);
+
+  // Reduce the eight lane maxima to one value
+  for(i = 0; i < 8; ++i) {
+    candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+  }
+
+  // Fold in the values that did not fill a whole block
+  for(i = 0; i < leftovers; ++i) {
+    candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
+  }
+
+  target[0] = candidate;
+}
+
+#endif /*LV_HAVE_SSSE3*/
+
+#ifdef LV_HAVE_GENERIC
+
+/*!
+  \brief Reduces src0 to a single value with a wrap-around 16 bit comparison and writes it to target[0]
+  \param target Output buffer; only target[0] is written
+  \param src0 Input buffer; src0[0] is always read
+  \param num_points The number of values in src0
+*/
+static inline void volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points) {
+  const unsigned int num_bytes = num_points*2;
+  const int count = num_bytes >> 1;
+  short best = src0[0];
+  int idx = 1;
+
+  while(idx < count) {
+    const short challenger = src0[idx];
+
+    // The difference is truncated to 16 bits before the sign test, so the
+    // comparison wraps around instead of being a plain "greater than"
+    if((short)(best - challenger) <= 0) {
+      best = challenger;
+    }
+    ++idx;
+  }
+
+  target[0] = best;
+}
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_max_star_16i_a_H*/
diff --git a/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h
new file mode 100644
index 000000000..ef88ec094
--- /dev/null
+++ b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h
@@ -0,0 +1,134 @@
+#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
+#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
+
+#include <volk/volk_common.h>
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+/*!
+  \brief For each adjacent pair (src0[2k], src0[2k+1]) writes the larger value to target[k]
+  \param target Output buffer; written with aligned 16 byte stores in the main loop
+  \param src0 Input buffer; read with aligned 16 byte loads
+  \param num_points The number of values in src0
+*/
+static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points) {
+
+  const unsigned int num_bytes = num_points*2;
+
+  // Shuffle tables that gather the even elements into the low / high half of a register
+  const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+  const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
+  // Adding 2 to a shuffle index moves the selection from the even to the odd element
+  const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+  const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
+
+  // The byte tables carry no 16 byte alignment guarantee, so load them unaligned
+  const __m128i evenLow  = _mm_loadu_si128((const __m128i*)shufmask0);
+  const __m128i evenHigh = _mm_loadu_si128((const __m128i*)shufmask1);
+  const __m128i stepLow  = _mm_loadu_si128((const __m128i*)andmask0);
+  const __m128i stepHigh = _mm_loadu_si128((const __m128i*)andmask1);
+  const __m128i zero = _mm_setzero_si128();
+
+  __m128i first, second, oddWins, pickLow, pickHigh;
+  __m128i *p_target = (__m128i*)target;
+  __m128i *p_src0 = (__m128i*)src0;
+
+  const int bound = num_bytes >> 5;
+  const int intermediate = (num_bytes >> 4) & 1;
+  const int leftovers = (num_bytes >> 1) & 7;
+
+  int i = 0;
+
+  // Main loop: 16 inputs in, 8 outputs out
+  for(i = 0; i < bound; ++i) {
+    first = _mm_load_si128(p_src0);
+    second = _mm_load_si128(&p_src0[1]);
+    p_src0 += 2;
+
+    // Lane k holds even - odd of pair k; a negative difference means the odd element wins
+    oddWins = _mm_cmpgt_epi16(zero, _mm_hsub_epi16(first, second));
+
+    pickLow = _mm_add_epi8(_mm_and_si128(oddWins, stepLow), evenLow);
+    pickHigh = _mm_add_epi8(_mm_and_si128(oddWins, stepHigh), evenHigh);
+
+    // Each shuffle zeroes the half it does not own, so the add merges both halves
+    _mm_store_si128(p_target, _mm_add_epi16(_mm_shuffle_epi8(first, pickLow), _mm_shuffle_epi8(second, pickHigh)));
+    p_target += 1;
+  }
+
+  // At most one remaining block of 8 inputs, producing 4 outputs
+  for(i = 0; i < intermediate; ++i) {
+    first = _mm_load_si128(p_src0);
+    p_src0 += 1;
+
+    // Only the low four lanes are used, so the register is paired with itself
+    oddWins = _mm_cmpgt_epi16(zero, _mm_hsub_epi16(first, first));
+    pickLow = _mm_add_epi8(_mm_and_si128(oddWins, stepLow), evenLow);
+
+    // Write just the low 64 bits (four results)
+    _mm_storel_epi64(p_target, _mm_shuffle_epi8(first, pickLow));
+    p_target = (__m128i*)((int8_t*)p_target + 8);
+  }
+
+  // Scalar tail for the remaining pairs
+  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
+    target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+  }
+}
+
+#endif /*LV_HAVE_SSSE3*/
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief For each adjacent pair (src0[2k], src0[2k+1]) writes the larger value to target[k]
+  \param target Output buffer; receives one value per input pair
+  \param src0 Input buffer
+  \param num_points The number of values in src0
+*/
+static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) {
+  const unsigned int num_bytes = num_points*2;
+  const int bound = num_bytes >> 1;
+  int even;
+
+  for(even = 0; even < bound; even += 2) {
+    const int odd = even + 1;
+
+    // The difference is truncated to 16 bits before the sign test
+    if((int16_t)(src0[even] - src0[odd]) > 0) {
+      target[even >> 1] = src0[even];
+    } else {
+      target[even >> 1] = src0[odd];
+    }
+  }
+}
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/
diff --git a/volk/kernels/volk/volk_16i_permute_and_scalar_add.h b/volk/kernels/volk/volk_16i_permute_and_scalar_add.h
new file mode 100644
index 000000000..7a01d172a
--- /dev/null
+++ b/volk/kernels/volk/volk_16i_permute_and_scalar_add.h
@@ -0,0 +1,142 @@
+#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
+#define INCLUDED_volk_16i_permute_and_scalar_add_a_H
+
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+
+
+#ifdef LV_HAVE_SSE2
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+/*!
+  \brief Computes target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) + (cntl3[i] & scalars[3])
+  \param target Output buffer; written with aligned 16 byte stores
+  \param src0 Input buffer that is gathered through permute_indexes
+  \param permute_indexes Index into src0 for every output element
+  \param cntl0 Control words ANDed with scalars[0]; read with aligned 16 byte loads
+  \param cntl1 Control words ANDed with scalars[1]; read with aligned 16 byte loads
+  \param cntl2 Control words ANDed with scalars[2]; read with aligned 16 byte loads
+  \param cntl3 Control words ANDed with scalars[3]; read with aligned 16 byte loads
+  \param scalars Four 16 bit mask values; only elements 0..3 are read
+  \param num_points The number of output values to compute
+*/
+static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) {
+
+  const unsigned int num_bytes = num_points*2;
+  const int bound = (num_bytes >> 4);
+  const int leftovers = (num_bytes >> 1) & 7;
+
+  // Broadcast each scalar on its own. A 16 byte aligned load of scalars would
+  // read past the four values that are actually used and require alignment.
+  const __m128i mask0 = _mm_set1_epi16(scalars[0]);
+  const __m128i mask1 = _mm_set1_epi16(scalars[1]);
+  const __m128i mask2 = _mm_set1_epi16(scalars[2]);
+  const __m128i mask3 = _mm_set1_epi16(scalars[3]);
+
+  __m128i *p_target = (__m128i*)target;
+  const __m128i *p_cntl0 = (const __m128i*)cntl0;
+  const __m128i *p_cntl1 = (const __m128i*)cntl1;
+  const __m128i *p_cntl2 = (const __m128i*)cntl2;
+  const __m128i *p_cntl3 = (const __m128i*)cntl3;
+  const short* p_permute_indexes = permute_indexes;
+
+  __m128i sum;
+  int i = 0;
+
+  for(; i < bound; ++i) {
+    // Gather eight permuted inputs; _mm_set_epi16 takes the highest lane first
+    sum = _mm_set_epi16(src0[p_permute_indexes[7]], src0[p_permute_indexes[6]],
+                        src0[p_permute_indexes[5]], src0[p_permute_indexes[4]],
+                        src0[p_permute_indexes[3]], src0[p_permute_indexes[2]],
+                        src0[p_permute_indexes[1]], src0[p_permute_indexes[0]]);
+    p_permute_indexes += 8;
+
+    // Add the four masked control words (16 bit wrap-around adds)
+    sum = _mm_add_epi16(sum, _mm_and_si128(_mm_load_si128(p_cntl0 + i), mask0));
+    sum = _mm_add_epi16(sum, _mm_and_si128(_mm_load_si128(p_cntl1 + i), mask1));
+    sum = _mm_add_epi16(sum, _mm_and_si128(_mm_load_si128(p_cntl2 + i), mask2));
+    sum = _mm_add_epi16(sum, _mm_and_si128(_mm_load_si128(p_cntl3 + i), mask3));
+
+    _mm_store_si128(p_target + i, sum);
+  }
+
+  // Scalar tail for the points that do not fill a whole block of eight
+  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+    target[i] = src0[permute_indexes[i]]
+      + (cntl0[i] & scalars[0])
+      + (cntl1[i] & scalars[1])
+      + (cntl2[i] & scalars[2])
+      + (cntl3[i] & scalars[3]);
+  }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) + (cntl3[i] & scalars[3])
+  \param target Output buffer
+  \param src0 Input buffer that is gathered through permute_indexes
+  \param permute_indexes Index into src0 for every output element
+  \param cntl0 Control words ANDed with scalars[0]
+  \param cntl1 Control words ANDed with scalars[1]
+  \param cntl2 Control words ANDed with scalars[2]
+  \param cntl3 Control words ANDed with scalars[3]
+  \param scalars Four 16 bit mask values
+  \param num_points The number of output values to compute
+*/
+static inline void volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) {
+  const unsigned int num_bytes = num_points*2;
+  const int bound = num_bytes >> 1;
+  int pos = 0;
+
+  while(pos < bound) {
+    // Sum of the four masked control words for this position
+    const int masked = (cntl0[pos] & scalars[0])
+      + (cntl1[pos] & scalars[1])
+      + (cntl2[pos] & scalars[2])
+      + (cntl3[pos] & scalars[3]);
+
+    target[pos] = src0[permute_indexes[pos]] + masked;
+    ++pos;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/
diff --git a/volk/kernels/volk/volk_16i_s32f_convert_32f.h b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
new file mode 100644
index 000000000..a810a601a
--- /dev/null
+++ b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
@@ -0,0 +1,241 @@
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
+#define INCLUDED_volk_16i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+  \brief Converts 16 bit integers to floats and divides every result by the scalar value
+  \param outputVector The floating point output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param scalar The value each converted point is divided by
+  \param num_points The number of data values to be converted
+  \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  const unsigned int eighthPoints = num_points / 8;
+  // The vector loop multiplies by the reciprocal; the scalar tail divides directly
+  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
+  const int16_t* in = inputVector;
+  float* out = outputVector;
+  unsigned int block;
+  unsigned int idx;
+
+  for(block = 0; block < eighthPoints; block++){
+    // Load 8 values, then sign-extend the lower and upper four to 32 bit words
+    const __m128i packed = _mm_loadu_si128((const __m128i*)in);
+    const __m128i lowWords = _mm_cvtepi16_epi32(packed);
+    const __m128i highWords = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));
+
+    _mm_storeu_ps(out, _mm_mul_ps(_mm_cvtepi32_ps(lowWords), invScalar));
+    _mm_storeu_ps(out + 4, _mm_mul_ps(_mm_cvtepi32_ps(highWords), invScalar));
+
+    in += 8;
+    out += 8;
+  }
+
+  // Scalar tail for the points that do not fill a whole block of eight
+  for(idx = eighthPoints * 8; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+ /*!
+  \brief Converts 16 bit integers to floats and divides every result by the scalar value
+  \param outputVector The floating point output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param scalar The value each converted point is divided by
+  \param num_points The number of data values to be converted
+  \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  // The vector loop multiplies by the reciprocal; the scalar tail divides directly
+  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
+  unsigned int quad;
+  unsigned int idx;
+
+  for(quad = 0; quad < quarterPoints; quad++){
+    const int16_t* in = inputVector + 4 * quad;
+    // _mm_set_ps takes the highest lane first, so lane k ends up holding in[k]
+    const __m128 asFloat = _mm_set_ps((float)(in[3]), (float)(in[2]), (float)(in[1]), (float)(in[0]));
+
+    _mm_storeu_ps(outputVector + 4 * quad, _mm_mul_ps(asFloat, invScalar));
+  }
+
+  // Scalar tail for the points that do not fill a whole block of four
+  for(idx = quarterPoints * 4; idx < num_points; idx++){
+    outputVector[idx] = (float)(inputVector[idx]) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+  \brief Converts 16 bit integers to floats and divides every result by the scalar value
+  \param outputVector The floating point output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param scalar The value each converted point is divided by
+  \param num_points The number of data values to be converted
+  \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
+#define INCLUDED_volk_16i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+  \brief Converts 16 bit integers to floats and divides every result by the scalar value
+  \param outputVector The floating point output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param scalar The value each converted point is divided by
+  \param num_points The number of data values to be converted
+ */
+static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  const unsigned int eighthPoints = num_points / 8;
+  // The vector loop multiplies by the reciprocal; the scalar tail divides directly
+  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
+  const int16_t* in = inputVector;
+  float* out = outputVector;
+  unsigned int block;
+  unsigned int idx;
+
+  for(block = 0; block < eighthPoints; block++){
+    // Load 8 values, then sign-extend the lower and upper four to 32 bit words
+    const __m128i packed = _mm_loadu_si128((const __m128i*)in);
+    const __m128i lowWords = _mm_cvtepi16_epi32(packed);
+    const __m128i highWords = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));
+
+    _mm_storeu_ps(out, _mm_mul_ps(_mm_cvtepi32_ps(lowWords), invScalar));
+    _mm_storeu_ps(out + 4, _mm_mul_ps(_mm_cvtepi32_ps(highWords), invScalar));
+
+    in += 8;
+    out += 8;
+  }
+
+  // Scalar tail for the points that do not fill a whole block of eight
+  for(idx = eighthPoints * 8; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+ /*!
+  \brief Converts 16 bit integers to floats and divides every result by the scalar value
+  \param outputVector The floating point output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param scalar The value each converted point is divided by
+  \param num_points The number of data values to be converted
+ */
+static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  // The vector loop multiplies by the reciprocal; the scalar tail divides directly
+  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
+  unsigned int quad;
+  unsigned int idx;
+
+  for(quad = 0; quad < quarterPoints; quad++){
+    const int16_t* in = inputVector + 4 * quad;
+    // _mm_set_ps takes the highest lane first, so lane k ends up holding in[k]
+    const __m128 asFloat = _mm_set_ps((float)(in[3]), (float)(in[2]), (float)(in[1]), (float)(in[0]));
+
+    _mm_storeu_ps(outputVector + 4 * quad, _mm_mul_ps(asFloat, invScalar));
+  }
+
+  // Scalar tail for the points that do not fill a whole block of four
+  for(idx = quarterPoints * 4; idx < num_points; idx++){
+    outputVector[idx] = (float)(inputVector[idx]) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+  \brief Converts 16 bit integers to floats and divides every result by the scalar value
+  \param outputVector The floating point output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param scalar The value each converted point is divided by
+  \param num_points The number of data values to be converted
+ */
+static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
diff --git a/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h
new file mode 100644
index 000000000..56b2cc07a
--- /dev/null
+++ b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h
@@ -0,0 +1,192 @@
+#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
+#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
+
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+
+
+
+#ifdef LV_HAVE_SSE2
+
+#include<emmintrin.h>
+
+/*!
+  \brief For every index picks one value out of src0..src3 by pairwise wrap-around comparison and stores it in target
+  \param target Output buffer; written with aligned 16 byte stores
+  \param src0 First input buffer; read with aligned 16 byte loads
+  \param src1 Second input buffer; read with aligned 16 byte loads
+  \param src2 Third input buffer; read with aligned 16 byte loads
+  \param src3 Fourth input buffer; read with aligned 16 byte loads
+  \param num_points The number of values in each buffer
+*/
+static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
+
+  const unsigned int num_bytes = num_points*2;
+  const int bound = (num_bytes >> 4);
+  const int leftovers = (num_bytes >> 1) & 7;
+
+  __m128i *p_target = (__m128i*) target;
+  const __m128i *p_src0 = (const __m128i*)src0;
+  const __m128i *p_src1 = (const __m128i*)src1;
+  const __m128i *p_src2 = (const __m128i*)src2;
+  const __m128i *p_src3 = (const __m128i*)src3;
+
+  int block;
+  int i;
+
+  for(block = 0; block < bound; ++block) {
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i in0 = _mm_load_si128(p_src0 + block);
+    const __m128i in1 = _mm_load_si128(p_src1 + block);
+    const __m128i in2 = _mm_load_si128(p_src2 + block);
+    const __m128i in3 = _mm_load_si128(p_src3 + block);
+
+    // First round: in1 beats in0 where (in1 - in0) > 0, likewise in3 against in2
+    const __m128i oneWins = _mm_cmpgt_epi16(_mm_sub_epi16(in1, in0), zero);
+    const __m128i threeWins = _mm_cmpgt_epi16(_mm_sub_epi16(in3, in2), zero);
+
+    // The two masked halves never overlap, so the add acts as a select
+    const __m128i best01 = _mm_add_epi16(_mm_and_si128(oneWins, in1), _mm_andnot_si128(oneWins, in0));
+    const __m128i best23 = _mm_add_epi16(_mm_and_si128(threeWins, in3), _mm_andnot_si128(threeWins, in2));
+
+    // Second round between the two first-round winners
+    const __m128i upperWins = _mm_cmpgt_epi16(_mm_sub_epi16(best23, best01), zero);
+
+    _mm_store_si128(p_target + block,
+                    _mm_add_epi16(_mm_and_si128(upperWins, best23), _mm_andnot_si128(upperWins, best01)));
+  }
+
+  // Scalar tail for the points that do not fill a whole block of eight
+  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+    const short temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+    const short temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+    target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+  }
+  return;
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief For every index picks one value out of src0..src3 by pairwise wrap-around comparison and stores it in target
+  \param target Output buffer
+  \param src0 First input buffer
+  \param src1 Second input buffer
+  \param src2 Third input buffer
+  \param src3 Fourth input buffer
+  \param num_points The number of values in each buffer
+*/
+static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
+  const unsigned int num_bytes = num_points*2;
+  const int bound = num_bytes >> 1;
+  int pos;
+
+  for(pos = 0; pos < bound; ++pos) {
+    short lower;
+    short upper;
+
+    // First round: src0 against src1 and src2 against src3
+    if((short)(src0[pos] - src1[pos]) > 0) { lower = src0[pos]; } else { lower = src1[pos]; }
+    if((short)(src2[pos] - src3[pos]) > 0) { upper = src2[pos]; } else { upper = src3[pos]; }
+
+    // Second round between the two first-round winners
+    if((short)(lower - upper) > 0) { target[pos] = lower; } else { target[pos] = upper; }
+  }
+}
+
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/
diff --git a/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
new file mode 100644
index 000000000..9b6d19fd6
--- /dev/null
+++ b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
@@ -0,0 +1,140 @@
+#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
+#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
+
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+
+
+
+#ifdef LV_HAVE_SSE2
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+/*!
+  \brief Adds src0 to each of src1..src4 and stores the four sums in target0..target3
+  \param target0 Receives src0 + src1; written with aligned 16 byte stores
+  \param target1 Receives src0 + src2; written with aligned 16 byte stores
+  \param target2 Receives src0 + src3; written with aligned 16 byte stores
+  \param target3 Receives src0 + src4; written with aligned 16 byte stores
+  \param src0 The common addend; read with aligned 16 byte loads
+  \param src1 First input buffer; read with aligned 16 byte loads
+  \param src2 Second input buffer; read with aligned 16 byte loads
+  \param src3 Third input buffer; read with aligned 16 byte loads
+  \param src4 Fourth input buffer; read with aligned 16 byte loads
+  \param num_points The number of values in each buffer
+*/
+static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {
+
+  const unsigned int num_bytes = num_points*2;
+  const int bound = (num_bytes >> 4);
+  const int leftovers = (num_bytes >> 1) & 7;
+
+  __m128i *p_target0 = (__m128i*)target0;
+  __m128i *p_target1 = (__m128i*)target1;
+  __m128i *p_target2 = (__m128i*)target2;
+  __m128i *p_target3 = (__m128i*)target3;
+
+  const __m128i *p_src0 = (const __m128i*)src0;
+  const __m128i *p_src1 = (const __m128i*)src1;
+  const __m128i *p_src2 = (const __m128i*)src2;
+  const __m128i *p_src3 = (const __m128i*)src3;
+  const __m128i *p_src4 = (const __m128i*)src4;
+
+  int block;
+  int i;
+
+  for(block = 0; block < bound; ++block) {
+    // All five inputs are loaded before any output is stored
+    const __m128i base = _mm_load_si128(p_src0 + block);
+    const __m128i in1 = _mm_load_si128(p_src1 + block);
+    const __m128i in2 = _mm_load_si128(p_src2 + block);
+    const __m128i in3 = _mm_load_si128(p_src3 + block);
+    const __m128i in4 = _mm_load_si128(p_src4 + block);
+
+    _mm_store_si128(p_target0 + block, _mm_add_epi16(base, in1));
+    _mm_store_si128(p_target1 + block, _mm_add_epi16(base, in2));
+    _mm_store_si128(p_target2 + block, _mm_add_epi16(base, in3));
+    _mm_store_si128(p_target3 + block, _mm_add_epi16(base, in4));
+  }
+
+  // Scalar tail for the points that do not fill a whole block of eight
+  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+    target0[i] = src0[i] + src1[i];
+    target1[i] = src0[i] + src2[i];
+    target2[i] = src0[i] + src3[i];
+    target3[i] = src0[i] + src4[i];
+  }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*!
+  \brief Adds src0 to each of src1..src4 and stores the four sums in target0..target3
+  \param target0 Receives src0 + src1
+  \param target1 Receives src0 + src2
+  \param target2 Receives src0 + src3
+  \param target3 Receives src0 + src4
+  \param src0 The common addend
+  \param src1 First input buffer
+  \param src2 Second input buffer
+  \param src3 Third input buffer
+  \param src4 Fourth input buffer
+  \param num_points The number of values in each buffer
+*/
+static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {
+  const unsigned int num_bytes = num_points*2;
+  const int bound = num_bytes >> 1;
+  int pos = 0;
+
+  while(pos < bound) {
+    target0[pos] = src0[pos] + src1[pos];
+    target1[pos] = src0[pos] + src2[pos];
+    target2[pos] = src0[pos] + src3[pos];
+    target3[pos] = src0[pos] + src4[pos];
+    ++pos;
+  }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+
+#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/
diff --git a/volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h b/volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h
new file mode 100644
index 000000000..9ce801264
--- /dev/null
+++ b/volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
+#define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ /* SSSE3 path: splits interleaved 16-bit value pairs into iBuffer (first
+    value of each pair) and qBuffer (second value). The main loop uses the
+    aligned intrinsics _mm_load_si128/_mm_store_si128, which require 16-byte
+    aligned addresses for complexVector, iBuffer and qBuffer. */
+ unsigned int number = 0;
+ /* Byte-wise view of the input so the pointer can advance 16 bytes per load. */
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+
+ /* Selectors for _mm_shuffle_epi8; a selector of 0x80 zeroes that output byte.
+    Mask1 gathers source bytes 0,1,4,5,8,9,12,13 into the low 8 bytes,
+    Mask2 gathers the same source bytes into the high 8 bytes. */
+ __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ /* Same layout, but gathering source bytes 2,3,6,7,10,11,14,15. */
+ __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+ __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+ /* Each iteration consumes two 16-byte loads and emits 8 values per output. */
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ /* The two shuffled halves occupy disjoint bytes, so OR merges them. */
+ iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
+ qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ /* Scalar tail for the remaining num_points % 8 points, continuing from
+    where the vector loop stopped. */
+ number = eighthPoints * 8;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *int16ComplexVectorPtr++;
+ *qBufferPtr++ = *int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ /* SSE2 path: splits interleaved 16-bit value pairs into iBuffer (first
+    value of each pair) and qBuffer (second value) using 16/32-bit lane
+    shuffles. The aligned intrinsics _mm_load_si128/_mm_store_si128 require
+    16-byte aligned addresses for complexVector, iBuffer and qBuffer. */
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
+ /* lowMask keeps the low 64 bits of a register, highMask the high 64 bits. */
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+ /* Each iteration consumes two 8-int16 loads and emits 8 values per output. */
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
+ /* _MM_SHUFFLE(3,1,2,0) maps lanes {a,b,c,d} to {a,c,b,d}: within each
+    64-bit half the two "first of pair" values move to the front. */
+ iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ /* Same permutation on 32-bit lanes: the four wanted values land in the low 64 bits. */
+ iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ /* _MM_SHUFFLE(2,0,3,1) maps 32-bit lanes {a,b,c,d} to {b,d,a,c}: for the
+    second register the wanted values land in the high 64 bits. */
+ iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ /* Low half from the first register, high half from the second. */
+ iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ /* Q side: {a,b,c,d} -> {b,d,a,c} on the 16-bit lanes moves the "second of
+    pair" values to the front of each 64-bit half; the 32-bit shuffles and
+    the masking then mirror the I side. */
+ qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
+
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ /* Scalar tail for the remaining num_points % 8 points. */
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  /* Portable reference path: treat the complex input as a flat run of int16
+     values and copy them out alternately, the first value of each pair to
+     iBuffer and the second to qBuffer. */
+  const int16_t* pair = (const int16_t*)complexVector;
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++, pair += 2){
+    iBuffer[idx] = pair[0];
+    qBuffer[idx] = pair[1];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+/* Implementation is declared here only; its definition is not in this header. */
+extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+/* Thin wrapper: forwards all four arguments unchanged to the _orc_impl routine.
+   NOTE(review): the wrapper is named "_u_orc" while the routine it calls is
+   named "_a_orc_impl" - confirm the intended naming/alignment convention. */
+static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */
diff --git a/volk/kernels/volk/volk_16ic_deinterleave_real_16i.h b/volk/kernels/volk/volk_16ic_deinterleave_real_16i.h
new file mode 100644
index 000000000..f6eccd77e
--- /dev/null
+++ b/volk/kernels/volk/volk_16ic_deinterleave_real_16i.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H
+#define INCLUDED_volk_16ic_deinterleave_real_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ /* SSSE3 path: copies the first 16-bit value of every input pair to iBuffer
+    and discards the second. The aligned intrinsics _mm_load_si128 and
+    _mm_store_si128 require 16-byte aligned addresses for complexVector and
+    iBuffer. */
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ /* Selectors for _mm_shuffle_epi8; a selector of 0x80 zeroes that output byte.
+    Mask1 gathers source bytes 0,1,4,5,8,9,12,13 into the low 8 bytes,
+    Mask2 gathers the same source bytes into the high 8 bytes. */
+ __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i complexVal1, complexVal2, iOutputVal;
+
+ /* Each iteration consumes two 8-int16 loads and emits 8 output values. */
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
+ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+ /* The two shuffled halves occupy disjoint bytes, so OR merges them. */
+ iOutputVal = _mm_or_si128(complexVal1, complexVal2);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 8;
+ }
+
+ /* Scalar tail for the remaining num_points % 8 points: copy one value,
+    skip the next. */
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ /* SSE2 path: copies the first 16-bit value of every input pair to iBuffer
+    and discards the second, using 16/32-bit lane shuffles. The aligned
+    intrinsics _mm_load_si128/_mm_store_si128 require 16-byte aligned
+    addresses for complexVector and iBuffer. */
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i complexVal1, complexVal2, iOutputVal;
+ /* lowMask keeps the low 64 bits of a register, highMask the high 64 bits. */
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+ /* Each iteration consumes two 8-int16 loads and emits 8 output values. */
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
+ /* _MM_SHUFFLE(3,1,2,0) maps lanes {a,b,c,d} to {a,c,b,d}: within each
+    64-bit half the two wanted values move to the front. */
+ complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ /* Same permutation on 32-bit lanes: wanted values land in the low 64 bits. */
+ complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ /* _MM_SHUFFLE(2,0,3,1) maps 32-bit lanes {a,b,c,d} to {b,d,a,c}: for the
+    second register the wanted values land in the high 64 bits. */
+ complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ /* Low half from the first register, high half from the second. */
+ iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 8;
+ }
+
+ /* Scalar tail for the remaining num_points % 8 points: copy one value,
+    skip the next. */
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  /* Portable reference path: walk the input as int16 pairs and keep only the
+     first value of each pair. */
+  const int16_t* pair = (const int16_t*)complexVector;
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++, pair += 2){
+    iBuffer[idx] = pair[0];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */
diff --git a/volk/kernels/volk/volk_16ic_deinterleave_real_8i.h b/volk/kernels/volk/volk_16ic_deinterleave_real_8i.h
new file mode 100644
index 000000000..f3d0c8352
--- /dev/null
+++ b/volk/kernels/volk/volk_16ic_deinterleave_real_8i.h
@@ -0,0 +1,94 @@
+#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
+#define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ /* SSSE3 path: takes the first 16-bit value of every input pair, shifts it
+    right by 8 bits and stores the result as int8 in iBuffer. The aligned
+    intrinsics _mm_load_si128/_mm_store_si128 require 16-byte aligned
+    addresses for complexVector and iBuffer. */
+ unsigned int number = 0;
+ /* Byte-wise view of the input so the pointer can advance 16 bytes per load. */
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ /* Selectors for _mm_shuffle_epi8; a selector of 0x80 zeroes that output byte.
+    Mask1 gathers source bytes 0,1,4,5,8,9,12,13 into the low 8 bytes,
+    Mask2 gathers the same source bytes into the high 8 bytes. */
+ __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+ /* Each iteration consumes four 16-byte loads and emits 16 int8 outputs. */
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for(number = 0; number < sixteenthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ /* Registers 1+2 -> eight 16-bit values for outputs 0..7. */
+ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+ complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+
+ /* Registers 3+4 -> eight 16-bit values for outputs 8..15. */
+ complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+ complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+
+ complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+
+
+ /* Arithmetic shift right by 8 keeps the sign-extended high byte of each value. */
+ complexVal1 = _mm_srai_epi16(complexVal1, 8);
+ complexVal3 = _mm_srai_epi16(complexVal3, 8);
+
+ /* Narrow 2 x 8 int16 lanes to 16 int8 lanes (signed-saturating pack; after
+    the shift every lane already fits in int8). */
+ iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 16;
+ }
+
+ /* Scalar tail for the remaining num_points % 16 points: shift one value,
+    skip the next. */
+ number = sixteenthPoints * 16;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+ int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  /* Portable reference path: for each int16 pair, shift the first value
+     right by 8 bits and store it as an int8; the second value is skipped. */
+  const int16_t* pair = (const int16_t*)complexVector;
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++, pair += 2){
+    iBuffer[idx] = ((int8_t)(pair[0] >> 8));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+/* Implementation is declared here only; its definition is not in this header. */
+extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+/* Thin wrapper: forwards all three arguments unchanged to the _orc_impl routine.
+   NOTE(review): the wrapper is named "_u_orc" while the routine it calls is
+   named "_a_orc_impl" - confirm the intended naming/alignment convention. */
+static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+ volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */
diff --git a/volk/kernels/volk/volk_16ic_magnitude_16i.h b/volk/kernels/volk/volk_16ic_magnitude_16i.h
new file mode 100644
index 000000000..b33306a12
--- /dev/null
+++ b/volk/kernels/volk/volk_16ic_magnitude_16i.h
@@ -0,0 +1,191 @@
+#ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
+#define INCLUDED_volk_16ic_magnitude_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  /* SSE3 path: four magnitudes per iteration. Inputs are converted to float
+     through an aligned scratch buffer, scaled by 1/32768, squared, pair-summed
+     with a horizontal add, square-rooted, scaled back by 32768 and truncated
+     to int16. */
+  const unsigned int quarterPoints = num_points / 4;
+  const int16_t* in = (const int16_t*)complexVector;
+  int16_t* out = magnitudeVector;
+
+  const __m128 vScalar = _mm_set_ps1(32768.0);
+  const __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+  __m128 lo, hi, sumSq;
+
+  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+  unsigned int n, k;
+
+  for(n = 0; n < quarterPoints; n++, in += 8){
+    /* int16 -> float for eight consecutive input values (four points) */
+    for(k = 0; k < 8; k++){
+      inputFloatBuffer[k] = (float)(in[k]);
+    }
+
+    lo = _mm_mul_ps(_mm_load_ps(&inputFloatBuffer[0]), invScalar);
+    hi = _mm_mul_ps(_mm_load_ps(&inputFloatBuffer[4]), invScalar);
+
+    /* Square every lane, then add adjacent lanes: one sum per point */
+    sumSq = _mm_hadd_ps(_mm_mul_ps(lo, lo), _mm_mul_ps(hi, hi));
+
+    /* sqrt, then scale the results back up */
+    _mm_store_ps(outputFloatBuffer, _mm_mul_ps(_mm_sqrt_ps(sumSq), vScalar));
+
+    for(k = 0; k < 4; k++){
+      *out++ = (int16_t)(outputFloatBuffer[k]);
+    }
+  }
+
+  /* Scalar tail for the remaining num_points % 4 points */
+  n = quarterPoints * 4;
+  out = &magnitudeVector[n];
+  in = (const int16_t*)&complexVector[n];
+  for(; n < num_points; n++, in += 2){
+    const float re = (float)(in[0]) / 32768.0;
+    const float im = (float)(in[1]) / 32768.0;
+    const float mag = sqrtf((re * re) + (im * im)) * 32768.0;
+    *out++ = (int16_t)(mag);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  /* SSE path: four magnitudes per iteration. Inputs are converted to float
+     through an aligned scratch buffer (refilled once per group of four
+     values), scaled by 1/32768, separated into first-of-pair and
+     second-of-pair vectors, squared, summed, square-rooted, scaled back by
+     32768 and truncated to int16. */
+  const unsigned int quarterPoints = num_points / 4;
+  const int16_t* in = (const int16_t*)complexVector;
+  int16_t* out = magnitudeVector;
+
+  const __m128 vScalar = _mm_set_ps1(32768.0);
+  const __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+  __m128 first, second, iSq, qSq;
+
+  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
+  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+  unsigned int n, k;
+
+  for(n = 0; n < quarterPoints; n++){
+    /* Points 0 and 1 of this group */
+    for(k = 0; k < 4; k++){
+      inputFloatBuffer[k] = (float)(in[k]);
+    }
+    first = _mm_mul_ps(_mm_load_ps(inputFloatBuffer), invScalar);
+    in += 4;
+
+    /* Points 2 and 3 of this group */
+    for(k = 0; k < 4; k++){
+      inputFloatBuffer[k] = (float)(in[k]);
+    }
+    second = _mm_mul_ps(_mm_load_ps(inputFloatBuffer), invScalar);
+    in += 4;
+
+    /* Even lanes of both registers (i1i2i3i4), then squared */
+    iSq = _mm_shuffle_ps(first, second, _MM_SHUFFLE(2,0,2,0));
+    iSq = _mm_mul_ps(iSq, iSq);
+    /* Odd lanes of both registers (q1q2q3q4), then squared */
+    qSq = _mm_shuffle_ps(first, second, _MM_SHUFFLE(3,1,3,1));
+    qSq = _mm_mul_ps(qSq, qSq);
+
+    /* sqrt(I^2 + Q^2), then scale the results back up */
+    _mm_store_ps(outputFloatBuffer, _mm_mul_ps(_mm_sqrt_ps(_mm_add_ps(iSq, qSq)), vScalar));
+
+    for(k = 0; k < 4; k++){
+      *out++ = (int16_t)(outputFloatBuffer[k]);
+    }
+  }
+
+  /* Scalar tail for the remaining num_points % 4 points */
+  n = quarterPoints * 4;
+  out = &magnitudeVector[n];
+  in = (const int16_t*)&complexVector[n];
+  for(; n < num_points; n++, in += 2){
+    const float re = (float)(in[0]) / 32768.0;
+    const float im = (float)(in[1]) / 32768.0;
+    const float mag = sqrtf((re * re) + (im * im)) * 32768.0;
+    *out++ = (int16_t)(mag);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  /* Portable reference path: for every int16 pair, divide both values by
+     32768, take sqrt(re^2 + im^2), multiply by 32768 and convert the result
+     to int16. */
+  const float scalar = 32768.0;
+  const int16_t* pair = (const int16_t*)complexVector;
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++, pair += 2){
+    const float real = ((float)(pair[0])) / scalar;
+    const float imag = ((float)(pair[1])) / scalar;
+    magnitudeVector[idx] = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC_DISABLED
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+/* Implementation is declared here only; its definition is not in this header.
+   Unlike the wrapper below it takes an explicit scalar argument. */
+extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
+/* Thin wrapper: forwards its arguments to the _orc_impl routine with the
+   scalar fixed at 32768.0. This section is only compiled when
+   LV_HAVE_ORC_DISABLED is defined. */
+static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+ volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
+}
+#endif /* LV_HAVE_ORC_DISABLED */
+
+
+#endif /* INCLUDED_volk_16ic_magnitude_16i_a_H */
diff --git a/volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h b/volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
new file mode 100644
index 000000000..55243b4aa
--- /dev/null
+++ b/volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
@@ -0,0 +1,109 @@
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex data values to be deinterleaved
+ */
+static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  /* SSE path: four points per iteration. Inputs are converted to float
+     through an aligned scratch buffer, multiplied by 1/scalar, then split
+     into even lanes (written to iBuffer) and odd lanes (written to qBuffer)
+     with aligned stores. */
+  const uint64_t quarterPoints = num_points / 4;
+  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
+  int16_t* in = (int16_t*)complexVector;
+  float* iOut = iBuffer;
+  float* qOut = qBuffer;
+  __m128 first, second;
+
+  __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
+
+  uint64_t n;
+  unsigned int k;
+
+  for(n = 0; n < quarterPoints; n++, in += 8, iOut += 4, qOut += 4){
+    /* int16 -> float for eight consecutive input values (four points) */
+    for(k = 0; k < 8; k++){
+      floatBuffer[k] = (float)(in[k]);
+    }
+
+    first = _mm_mul_ps(_mm_load_ps(&floatBuffer[0]), invScalar);
+    second = _mm_mul_ps(_mm_load_ps(&floatBuffer[4]), invScalar);
+
+    /* Even lanes: i1i2i3i4 */
+    _mm_store_ps(iOut, _mm_shuffle_ps(first, second, _MM_SHUFFLE(2,0,2,0)));
+    /* Odd lanes: q1q2q3q4 */
+    _mm_store_ps(qOut, _mm_shuffle_ps(first, second, _MM_SHUFFLE(3,1,3,1)));
+  }
+
+  /* Scalar tail for the remaining num_points % 4 points (divides by scalar) */
+  n = quarterPoints * 4;
+  in = (int16_t*)&complexVector[n];
+  for(; n < num_points; n++, in += 2){
+    *iOut++ = (float)(in[0]) / scalar;
+    *qOut++ = (float)(in[1]) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex data values to be deinterleaved
+ */
+static inline void volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  /* Portable reference path: convert each int16 of a pair to float, divide it
+     by scalar, and write the first value to iBuffer and the second to
+     qBuffer. */
+  const int16_t* pair = (const int16_t*)complexVector;
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++, pair += 2){
+    iBuffer[idx] = (float)(pair[0]) / scalar;
+    qBuffer[idx] = (float)(pair[1]) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+ \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex data values to be deinterleaved
+ */
+/* Implementation is declared here only; its definition is not in this header. */
+extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+/* Thin wrapper: forwards all five arguments unchanged to the _orc_impl routine.
+   NOTE(review): the wrapper is named "_u_orc" while the routine it calls is
+   named "_a_orc_impl" - confirm the intended naming/alignment convention. */
+static inline void volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */
diff --git a/volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h b/volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
new file mode 100644
index 000000000..57d078a59
--- /dev/null
+++ b/volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
@@ -0,0 +1,126 @@
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I float vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param scalar The value each input data point is divided by (implemented as a multiply by 1.0/scalar)
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ /* SSE4.1 path: converts the first 16-bit value of every input pair to float,
+    multiplies it by 1.0/scalar and stores it in iBuffer; the second value of
+    each pair is discarded. The aligned intrinsics _mm_load_si128 and
+    _mm_store_ps require 16-byte aligned addresses for complexVector and
+    iBuffer. */
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iFloatValue;
+
+ /* The reciprocal is computed once so the loops multiply instead of divide. */
+ const float iScalar= 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal;
+ /* Byte-wise view of the input so the pointer can advance 16 bytes per load. */
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ /* Selector for _mm_shuffle_epi8: gathers source bytes 0,1,4,5,8,9,12,13 into
+    the low 8 bytes; 0x80 selectors zero the high 8 bytes. */
+ __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+
+ /* Each iteration consumes one 16-byte load and emits 4 floats. */
+ for(;number < quarterPoints; number++){
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+ /* Sign-extend the four low int16 lanes to int32, then convert to float. */
+ iIntVal = _mm_cvtepi16_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 4;
+ }
+
+ /* Scalar tail for the remaining num_points % 4 points; the input pointer is
+    recomputed from the point index. */
+ number = quarterPoints * 4;
+ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+ sixteenTComplexVectorPtr++;
+ }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I float vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param scalar The value each input data point is divided by (implemented as a multiply by 1.0/scalar)
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  /* SSE path: four points per iteration. The first int16 of each pair is
+     converted to float through an aligned scratch buffer, multiplied by
+     1.0/scalar and written to iBuffer with an aligned store; the second value
+     of each pair is skipped. */
+  const unsigned int quarterPoints = num_points / 4;
+  const float iScalar = 1.0/scalar;
+  const __m128 invScalar = _mm_set_ps1(iScalar);
+  int16_t* in = (int16_t*)complexVector;
+  float* out = iBuffer;
+
+  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+  unsigned int n, k;
+
+  for(n = 0; n < quarterPoints; n++, in += 8, out += 4){
+    /* Every other int16, starting at the first value of the group */
+    for(k = 0; k < 4; k++){
+      floatBuffer[k] = (float)(in[2 * k]);
+    }
+
+    _mm_store_ps(out, _mm_mul_ps(_mm_load_ps(floatBuffer), invScalar));
+  }
+
+  /* Scalar tail for the remaining num_points % 4 points */
+  n = quarterPoints * 4;
+  in = (int16_t*)&complexVector[n];
+  for(; n < num_points; n++, in += 2){
+    *out++ = ((float)(in[0])) * iScalar;
+  }
+
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex 16 bit vector into I float vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param scalar The value each input data point is divided by (implemented as a multiply by 1.0/scalar)
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  /* Portable reference path: convert the first int16 of each pair to float
+     and multiply it by the precomputed reciprocal of scalar; the second value
+     of each pair is skipped. */
+  const float invScalar = 1.0 / scalar;
+  const int16_t* pair = (const int16_t*)complexVector;
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++, pair += 2){
+    iBuffer[idx] = ((float)(pair[0])) * invScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H */
diff --git a/volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h b/volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h
new file mode 100644
index 000000000..27901cb9a
--- /dev/null
+++ b/volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h
@@ -0,0 +1,180 @@
+#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
+#define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Divides each complexVector sample by scalar, calculates its magnitude and stores the results in the magnitudeVector
+  \param magnitudeVector The vector receiving the real output values (written with aligned 16-byte stores)
+  \param complexVector The vector containing the complex input values
+  \param scalar The data value each input data value of the input complex vector is divided by
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+
+  const float iScalar = 1.0 / scalar; // One reciprocal shared by the vector loop and the scalar tail
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128 cplxValue1, cplxValue2, result;
+
+  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+
+  for(;number < quarterPoints; number++){
+    // Convert four complex samples (eight int16_t values) to float through the aligned staging buffer
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values (adjacent pairs)
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    _mm_store_ps(magnitudeVectorPtr, result);
+
+    magnitudeVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    float val1Real = (float)(*complexVectorPtr++) * iScalar; // Same reciprocal multiply as the vector loop
+    float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Divides each complexVector sample by scalar, calculates its magnitude and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector receiving the real output values (written with aligned 16-byte stores)
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ const float iScalar = 1.0 / scalar; // Reciprocal used by both the vector loop and the scalar tail
+ __m128 invScalar = _mm_set_ps1(iScalar);
+
+ __m128 cplxValue1, cplxValue2, result, re, im;
+
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; // Staging area for the int16_t -> float conversion
+
+ for(;number < quarterPoints; number++){
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); // Elements 0 and 2 of each register: the first value of every pair
+ im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); // Elements 1 and 3 of each register: the second value of every pair
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(re, invScalar);
+ cplxValue2 = _mm_mul_ps(im, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ _mm_store_ps(magnitudeVectorPtr, result);
+
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4; // Scalar tail: the points left over after the groups of four
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ float val1Real = (float)(*complexVectorPtr++) * iScalar;
+ float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+
+
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Divides each complexVector sample by scalar, calculates its magnitude and stores the results in the magnitudeVector
+  \param magnitudeVector The vector receiving the real output values
+  \param complexVector The vector containing the complex input values
+  \param scalar The data value each input data value of the input complex vector is divided by
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const float invScalar = 1.0 / scalar;
+  const int16_t* samplePtr = (const int16_t*)complexVector;
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++, samplePtr += 2){
+    // Scale both components before squaring
+    const float re = ( (float) (samplePtr[0])) * invScalar;
+    const float im = ( (float) (samplePtr[1])) * invScalar;
+    magnitudeVector[idx] = sqrtf((re*re) + (im*im));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC_DISABLED
+/*!
+ \brief Forwards all arguments unchanged to the external volk_16ic_s32f_magnitude_32f_a_orc_impl (presumably the ORC magnitude kernel - its body is not in this header)
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param scalar The data value to be divided against each input data value of the input complex vector
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+ volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC_DISABLED */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a_H */
diff --git a/volk/kernels/volk/volk_16u_byteswap.h b/volk/kernels/volk/volk_16u_byteswap.h
new file mode 100644
index 000000000..57f200899
--- /dev/null
+++ b/volk/kernels/volk/volk_16u_byteswap.h
@@ -0,0 +1,140 @@
+#ifndef INCLUDED_volk_16u_byteswap_u_H
+#define INCLUDED_volk_16u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+ \brief Byteswaps (in-place) an unaligned vector of uint16_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param num_points The number of data points
+*/
+static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int number = 0;
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8;
+ for(;number < eighthPoints; number++){
+ // Load eight 16 bit values (unaligned load), increment inputPtr later since we're doing it in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+ // Do the two shifts: each 16 bit element is shifted left by 8 and right by 8
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together
+ output = _mm_or_si128(left, right);
+ // Store the results (unaligned store) over the values just read
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+ // Byteswap any remaining points (fewer than eight) one at a time:
+ number = eighthPoints*8;
+ for(; number < num_points; number++){
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) a vector of uint16_t's, one value at a time.
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++){
+    // Split the value into its two bytes...
+    const uint16_t highByte = (intsToSwap[idx] >> 8) & 0xff;
+    const uint16_t lowByte = intsToSwap[idx] & 0xff;
+    // ...and write them back in the opposite order
+    intsToSwap[idx] = (uint16_t)(highByte | (lowByte << 8));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16u_byteswap_u_H */
+#ifndef INCLUDED_volk_16u_byteswap_a_H
+#define INCLUDED_volk_16u_byteswap_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+ \brief Byteswaps (in-place) an aligned vector of uint16_t's.
+ \param intsToSwap The vector of data to byte swap (accessed with aligned 16-byte loads and stores)
+ \param num_points The number of data points
+*/
+static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int number = 0;
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8;
+ for(;number < eighthPoints; number++){
+ // Load eight 16 bit values (aligned load), increment inputPtr later since we're doing it in-place.
+ input = _mm_load_si128((__m128i*)inputPtr);
+ // Do the two shifts: each 16 bit element is shifted left by 8 and right by 8
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together
+ output = _mm_or_si128(left, right);
+ // Store the results (aligned store) over the values just read
+ _mm_store_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+
+ // Byteswap any remaining points (fewer than eight) one at a time:
+ number = eighthPoints*8;
+ for(; number < num_points; number++){
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) a vector of uint16_t's, one value at a time (no aligned accesses are made here).
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++){
+    // Split the value into its two bytes...
+    const uint16_t highByte = (intsToSwap[idx] >> 8) & 0xff;
+    const uint16_t lowByte = intsToSwap[idx] & 0xff;
+    // ...and write them back in the opposite order
+    intsToSwap[idx] = (uint16_t)(highByte | (lowByte << 8));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Forwards to the external volk_16u_byteswap_a_orc_impl (presumably the ORC in-place byteswap of uint16_t's - its body is not in this header).
+ \param intsToSwap The vector of data to byte swap
+ \param num_points The number of data points
+*/
+extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
+static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){
+ volk_16u_byteswap_a_orc_impl(intsToSwap, num_points); // NOTE(review): a "_u_" wrapper calling an "_a_" impl - confirm the naming is intended
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16u_byteswap_a_H */
diff --git a/volk/kernels/volk/volk_32f_accumulator_s32f.h b/volk/kernels/volk/volk_32f_accumulator_s32f.h
new file mode 100644
index 000000000..a67d10f9b
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_accumulator_s32f.h
@@ -0,0 +1,68 @@
+#ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
+#define INCLUDED_volk_32f_accumulator_s32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Accumulates (sums) the values in the input buffer
+ \param result The accumulated result (a single float is written)
+ \param inputBuffer The buffer of data to be accumulated (read with aligned 16-byte loads)
+ \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps(); // Four independent partial sums, one per lane
+ __m128 aVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ accumulator = _mm_add_ps(accumulator, aVal);
+ aPtr += 4;
+ }
+ _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
+ returnValue = tempBuffer[0]; // Fold the four partial sums into one value
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+
+ number = quarterPoints * 4; // Scalar tail: the points left over after the groups of four
+ for(;number < num_points; number++){
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Accumulates (sums) the values in the input buffer, in input order
+  \param result The accumulated result (a single float is written; 0 when num_points is 0)
+  \param inputBuffer The buffer of data to be accumulated
+  \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){
+  float sum = 0;
+  unsigned int idx;
+
+  // Add the samples strictly front to back
+  for(idx = 0; idx < num_points; idx++){
+    sum += inputBuffer[idx];
+  }
+  *result = sum;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_convert_64f.h b/volk/kernels/volk/volk_32f_convert_64f.h
new file mode 100644
index 000000000..2f036955d
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_convert_64f.h
@@ -0,0 +1,140 @@
+#ifndef INCLUDED_volk_32f_convert_64f_u_H
+#define INCLUDED_volk_32f_convert_64f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Converts the float values into double values
+ \param outputVector The converted double vector values (written with unaligned stores)
+ \param inputVector The float vector values to be converted (read with unaligned loads)
+ \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm_cvtps_pd(inputVal); // Converts the two lower floats to doubles
+
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+
+ inputVal = _mm_movehl_ps(inputVal, inputVal); // Move the two upper floats into the lower half
+
+ ret = _mm_cvtps_pd(inputVal); // Converts what were the two upper floats
+
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
+
+ number = quarterPoints * 4; // Scalar tail: the points left over after the groups of four
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the float values into double values, one element at a time
+  \param outputVector The converted double vector values
+  \param inputVector The float vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  // Element idx of the input becomes element idx of the output
+  for(idx = 0; idx < num_points; idx++){
+    const float sample = inputVector[idx];
+    outputVector[idx] = (double)sample;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_u_H */
+#ifndef INCLUDED_volk_32f_convert_64f_a_H
+#define INCLUDED_volk_32f_convert_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Converts the float values into double values
+ \param outputVector The converted double vector values (written with aligned 16-byte stores)
+ \param inputVector The float vector values to be converted (read with aligned 16-byte loads)
+ \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm_cvtps_pd(inputVal); // Converts the two lower floats to doubles
+
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+
+ inputVal = _mm_movehl_ps(inputVal, inputVal); // Move the two upper floats into the lower half
+
+ ret = _mm_cvtps_pd(inputVal); // Converts what were the two upper floats
+
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
+
+ number = quarterPoints * 4; // Scalar tail: the points left over after the groups of four
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the float values into double values, one element at a time (no aligned accesses are made here)
+  \param outputVector The converted double vector values
+  \param inputVector The float vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  // Element idx of the input becomes element idx of the output
+  for(idx = 0; idx < num_points; idx++){
+    const float sample = inputVector[idx];
+    outputVector[idx] = (double)sample;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_a_H */
diff --git a/volk/kernels/volk/volk_32f_index_max_16u.h b/volk/kernels/volk/volk_32f_index_max_16u.h
new file mode 100644
index 000000000..dd1aed245
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_index_max_16u.h
@@ -0,0 +1,149 @@
+#ifndef INCLUDED_volk_32f_index_max_16u_a_H
+#define INCLUDED_volk_32f_index_max_16u_a_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include<smmintrin.h>
+/*! \brief Writes to target[0] the index of the largest value in src0 (read with aligned 16-byte loads); target is untouched when num_points is 0 */
+static inline void volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) {
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); // Starts 4 below 0,1,2,3 so the first add in the loop yields indexes 0..3
+
+ float max = src0[0];
+ float index = 0; // Indexes are tracked as floats so they can live in the same SSE registers as the values
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for(;number < quarterPoints; number++){
+
+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues); // Mask set where the running maximum is still strictly larger
+
+ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); // Keep the old index where the mask is set, else take the current one
+ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); // NOTE(review): an equal value in a lane replaces the stored index - confirm tie-breaking vs. the generic version
+ }
+
+ // Reduce the four per-lane maxima (and their indexes) to a single maximum
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++){
+ if(maxValuesBuffer[number] > max){
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4; // Scalar tail: the points left over after the groups of four
+ for(;number < num_points; number++){
+ if(src0[number] > max){
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (unsigned int)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE
+#include<xmmintrin.h>
+/*! \brief Writes to target[0] the index of the largest value in src0 (read with aligned 16-byte loads); target is untouched when num_points is 0 */
+static inline void volk_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) {
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); // Starts 4 below 0,1,2,3 so the first add in the loop yields indexes 0..3
+
+ float max = src0[0];
+ float index = 0; // Indexes are tracked as floats so they can live in the same SSE registers as the values
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for(;number < quarterPoints; number++){
+
+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues); // Mask set where the running maximum is still strictly larger
+
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); // Old index where the mask is set, current index elsewhere
+
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); // Same and/andnot/or select for the values
+ }
+
+ // Reduce the four per-lane maxima (and their indexes) to a single maximum
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++){
+ if(maxValuesBuffer[number] > max){
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4; // Scalar tail: the points left over after the groups of four
+ for(;number < num_points; number++){
+ if(src0[number] > max){
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (unsigned int)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_GENERIC
+/*! \brief Writes to target[0] the index of the largest value in src0 (the first one on ties); target is untouched when num_points is 0 */
+static inline void volk_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) {
+  unsigned int bestIndex = 0;
+  unsigned int candidate;
+  float bestValue;
+
+  // Nothing to search: leave target as it is
+  if(num_points == 0) return;
+  bestValue = src0[0];
+  for(candidate = 1; candidate < num_points; candidate++) {
+    // Strict comparison, so an equal later value never displaces an earlier one
+    if(src0[candidate] > bestValue){
+      bestValue = src0[candidate];
+      bestIndex = candidate;
+    }
+  }
+  target[0] = bestIndex;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
diff --git a/volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h b/volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
new file mode 100644
index 000000000..71881c2d5
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
+#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector.
+ \param outputVector The 16-byte aligned vector where the results will be stored (written with _mm_store_ps).
+ \param inputVector The 16-byte aligned input vector containing phase data (must be on the interval (-bound,bound] )
+ \param bound The interval that the input phase data is in, which is used to modulo the differentiation
+ \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample; updated to the last input sample on return.
+ \param num_points The number of real values in the input vector.
+*/
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 1; // The vector loop starts at block 1: block 0 (the first four points) is done by hand below
+ unsigned int j = 0;
+ // num_points-1 keeps Fedora 7's gcc from crashing...
+ // num_points won't work. :(
+ const unsigned int quarterPoints = (num_points-1) / 4;
+
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+ __m128 upperBound = _mm_set_ps1(bound);
+ __m128 lowerBound = _mm_set_ps1(-bound);
+ __m128 next3old1;
+ __m128 next4;
+ __m128 boundAdjust;
+ __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above.
+ __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below.
+ // Do the first 4 (or fewer, if num_points < 4) by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+ for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) {
+ *outPtr = *(inPtr) - *(inPtr-1);
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ for (; number < quarterPoints; number++) {
+ // Load data: the previous four samples (unaligned, one element back) and the current four (aligned)
+ next3old1 = _mm_loadu_ps((float*) (inPtr-1));
+ next4 = _mm_load_ps(inPtr);
+ inPtr += 4;
+ // Subtract: current sample minus the sample before it, four at a time
+ next3old1 = _mm_sub_ps(next4, next3old1);
+ // Bound: build a per-lane correction of -2*bound, +2*bound or 0
+ boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
+ boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
+ next4 = _mm_cmplt_ps(next3old1, lowerBound);
+ next4 = _mm_and_ps(next4, negBoundAdjust);
+ boundAdjust = _mm_or_ps(next4, boundAdjust);
+ // Make sure we're in the bounding interval:
+ next3old1 = _mm_add_ps(next3old1, boundAdjust);
+ _mm_store_ps(outPtr,next3old1); // Store the results back into the output
+ outPtr += 4;
+ }
+
+ for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) {
+ *outPtr = *(inPtr) - *(inPtr-1);
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector.
+  \param outputVector The vector where the results will be stored.
+  \param inputVector The input vector containing phase data (must be on the interval (-bound,bound] )
+  \param bound The interval that the input phase data is in, which is used to modulo the differentiation
+  \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample; updated to the last input sample on return.
+  \param num_points The number of real values in the input vector.
+*/
+static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  unsigned int idx;
+
+  // An empty input produces no output...
+  // ...and leaves saveValue untouched
+  if (num_points < 1) {
+    return;
+  }
+
+  // The first difference is taken against the sample carried over in saveValue
+  outputVector[0] = inputVector[0] - *saveValue;
+  if (outputVector[0] > bound) outputVector[0] -= 2*bound;
+  if (outputVector[0] < -bound) outputVector[0] += 2*bound;
+
+  // Every later difference is taken against the preceding input sample
+  for (idx = 1; idx < num_points; idx++) {
+    outputVector[idx] = inputVector[idx] - inputVector[idx-1];
+    // A difference outside [-bound, bound] is shifted by 2*bound toward the interval
+    if (outputVector[idx] > bound) outputVector[idx] -= 2*bound;
+    if (outputVector[idx] < -bound) outputVector[idx] += 2*bound;
+  }
+
+  // Carry the last input sample over to the next call
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h b/volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
new file mode 100644
index 000000000..bf05a882d
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
@@ -0,0 +1,168 @@
+#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
+#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Calculates the spectral noise floor of an input power spectrum
+
+ Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor.
+
+ \param realDataPoints The input power spectrum (read with aligned 16-byte loads)
+ \param num_points The number of data points in the input power spectrum vector
+ \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+ \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+*/
+static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* dataPointsPtr = realDataPoints;
+ __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4];
+
+ __m128 dataPointsVal;
+ __m128 avgPointsVal = _mm_setzero_ps();
+ // Calculate the sum (for mean) for all points
+ for(; number < quarterPoints; number++){
+
+ dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 4;
+
+ avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
+ }
+
+ _mm_store_ps(avgPointsVector, avgPointsVal);
+
+ float sumMean = 0.0; // Fold the four per-lane partial sums into one value
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+
+ number = quarterPoints * 4; // Scalar tail: the points left over after the groups of four
+ for(;number < num_points; number++){
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // + spectralExclusionValue because for the comparison below we only want to throw out bins
+ // that are significantly higher (and would, thus, affect the mean more)
+ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+ __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
+ __m128 vOnesVector = _mm_set_ps1(1.0);
+ __m128 vValidBinCount = _mm_setzero_ps();
+ avgPointsVal = _mm_setzero_ps();
+ __m128 compareMask;
+ number = 0;
+ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+ for(; number < quarterPoints; number++){
+
+ dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 4;
+
+ // Identify which items do not exceed the mean amplitude
+ compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
+
+ // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
+ avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
+
+ // Count the number of bins which do not exceed the mean amplitude
+ vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
+ }
+
+ // Calculate the mean from the remaining data points
+ _mm_store_ps(avgPointsVector, avgPointsVal);
+
+ sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+
+ // Calculate the number of valid bins from the remaining count
+ __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4];
+ _mm_store_ps(validBinCountVector, vValidBinCount);
+
+ float validBinCount = 0;
+ validBinCount += validBinCountVector[0];
+ validBinCount += validBinCountVector[1];
+ validBinCount += validBinCountVector[2];
+ validBinCount += validBinCountVector[3];
+
+ number = quarterPoints * 4; // Scalar tail of the second pass
+ for(;number < num_points; number++){
+ if(realDataPoints[number] <= meanAmplitude){
+ sumMean += realDataPoints[number];
+ validBinCount += 1.0;
+ }
+ }
+
+ float localNoiseFloorAmplitude = 0;
+ if(validBinCount > 0.0){
+ localNoiseFloorAmplitude = sumMean / validBinCount;
+ }
+ else{
+ localNoiseFloorAmplitude = meanAmplitude; // No bin passed the comparison, so there is nothing to average: report the threshold itself
+ }
+
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Calculates the spectral noise floor of an input power spectrum
+
+  Takes the mean of the input power spectrum, adds spectralExclusionValue to it to form a threshold, then averages only the values that do not exceed that threshold. Provides a rough estimation of the signal noise floor.
+
+  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB (a single float is written)
+  \param realDataPoints The input power spectrum
+  \param spectralExclusionValue The number of dB above the mean that a data point must exceed to be excluded from the noise floor calculation - default value is 20
+  \param num_points The number of data points in the input power spectrum vector
+*/
+static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+  float total = 0.0;
+  float keptTotal = 0.0;
+  unsigned int keptBins = 0;
+  unsigned int idx;
+  float threshold;
+
+  // First pass: sum every bin so the overall mean can be formed
+  for(idx = 0; idx < num_points; idx++){
+    total += realDataPoints[idx];
+  }
+
+  // Only bins significantly above the mean (by more than spectralExclusionValue)
+  // are thrown out of the second average
+  threshold = (total / num_points) + spectralExclusionValue;
+
+  // Second pass: accumulate and count only the bins at or below the threshold
+  for(idx = 0; idx < num_points; idx++){
+    if (realDataPoints[idx] <= threshold){
+      keptTotal += realDataPoints[idx];
+      keptBins++;
+    }
+  }
+
+  // With no surviving bins there is nothing to average, so report the threshold itself
+  if (keptBins == 0){
+    *noiseFloorAmplitude = threshold;
+  }
+  else{
+    *noiseFloorAmplitude = keptTotal / ((float)keptBins);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_s32f_convert_16i.h b/volk/kernels/volk/volk_32f_s32f_convert_16i.h
new file mode 100644
index 000000000..9fd758655
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_s32f_convert_16i.h
@@ -0,0 +1,302 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
+#define INCLUDED_volk_32f_s32f_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  // Saturation limits of the signed 16 bit output
+  const float lowerLimit = -32768;
+  const float upperLimit = 32767;
+
+  const __m128 vScale = _mm_set_ps1(scalar);
+  const __m128 vLower = _mm_set_ps1(lowerLimit);
+  const __m128 vUpper = _mm_set_ps1(upperLimit);
+
+  const unsigned int blockCount = num_points / 8;
+  const float* src = inputVector;
+  int16_t* dst = outputVector;
+  unsigned int i;
+
+  // Vector part: eight floats in, eight int16 values out per iteration
+  for(i = 0; i < blockCount; i++, src += 8, dst += 8){
+    __m128 lo = _mm_mul_ps(_mm_loadu_ps(src), vScale);
+    __m128 hi = _mm_mul_ps(_mm_loadu_ps(src + 4), vScale);
+
+    // Clip both quads to [lowerLimit, upperLimit]
+    lo = _mm_max_ps(_mm_min_ps(lo, vUpper), vLower);
+    hi = _mm_max_ps(_mm_min_ps(hi, vUpper), vLower);
+
+    // Convert each quad to 32 bit integers, pack them into one register of
+    // eight 16 bit values and write it out with an unaligned store
+    _mm_storeu_si128((__m128i*)dst, _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)));
+  }
+
+  // Scalar part: the num_points % 8 samples the vector loop did not reach
+  for(i = blockCount * 8; i < num_points; i++){
+    float scaled = inputVector[i] * scalar;
+    if(scaled > upperLimit)
+      scaled = upperLimit;
+    else if(scaled < lowerLimit)
+      scaled = lowerLimit;
+    outputVector[i] = (int16_t)rintf(scaled);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  // Saturation limits of the signed 16 bit output
+  const float lowerLimit = -32768;
+  const float upperLimit = 32767;
+  const unsigned int blockCount = num_points / 4;
+
+  const __m128 vScale = _mm_set_ps1(scalar);
+  const __m128 vLower = _mm_set_ps1(lowerLimit);
+  const __m128 vUpper = _mm_set_ps1(upperLimit);
+
+  // Scratch buffer for the aligned _mm_store_ps below; the clipped floats are
+  // spilled here and then rounded one lane at a time
+  __VOLK_ATTR_ALIGNED(16) float spill[4];
+
+  const float* src = inputVector;
+  int16_t* dst = outputVector;
+  unsigned int block, lane, idx;
+
+  for(block = 0; block < blockCount; block++){
+    // Unaligned load, scale, then clip to [lowerLimit, upperLimit]
+    const __m128 clipped = _mm_max_ps(_mm_min_ps(_mm_mul_ps(_mm_loadu_ps(src), vScale), vUpper), vLower);
+    src += 4;
+
+    _mm_store_ps(spill, clipped);
+    for(lane = 0; lane < 4; lane++)
+      *dst++ = (int16_t)rintf(spill[lane]);
+  }
+
+  // Leftover samples (num_points % 4) get the same clip-and-round in plain C
+  for(idx = blockCount * 4; idx < num_points; idx++){
+    float scaled = inputVector[idx] * scalar;
+    if(scaled > upperLimit)
+      scaled = upperLimit;
+    else if(scaled < lowerLimit)
+      scaled = lowerLimit;
+    outputVector[idx] = (int16_t)rintf(scaled);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  // Saturation limits of the signed 16 bit output
+  const float lowerLimit = -32768;
+  const float upperLimit = 32767;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    const float scaled = inputVector[idx] * scalar;
+    // Clip first so that the rounded value always fits into an int16_t
+    const float clipped = (scaled > upperLimit) ? upperLimit
+                        : ((scaled < lowerLimit) ? lowerLimit : scaled);
+    outputVector[idx] = (int16_t)rintf(clipped);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
+#define INCLUDED_volk_32f_s32f_convert_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  // Saturation limits of the signed 16 bit output
+  const float clipLow = -32768;
+  const float clipHigh = 32767;
+  // Number of samples handled by the SIMD loop (largest multiple of 8)
+  const unsigned int vectorised = (num_points / 8) * 8;
+
+  const __m128 gain = _mm_set_ps1(scalar);
+  const __m128 lowQuad = _mm_set_ps1(clipLow);
+  const __m128 highQuad = _mm_set_ps1(clipHigh);
+  unsigned int pos = 0;
+
+  // SIMD loop: two aligned loads of four floats, one aligned store of eight int16
+  while(pos < vectorised){
+    __m128 first = _mm_load_ps(inputVector + pos);
+    __m128 second = _mm_load_ps(inputVector + pos + 4);
+    __m128i narrowed;
+
+    // Scale and clip
+    first = _mm_max_ps(_mm_min_ps(_mm_mul_ps(first, gain), highQuad), lowQuad);
+    second = _mm_max_ps(_mm_min_ps(_mm_mul_ps(second, gain), highQuad), lowQuad);
+
+    // Convert to 32 bit integers and pack both quads into eight 16 bit values
+    narrowed = _mm_packs_epi32(_mm_cvtps_epi32(first), _mm_cvtps_epi32(second));
+    _mm_store_si128((__m128i*)(outputVector + pos), narrowed);
+    pos += 8;
+  }
+
+  // Remaining samples are processed one at a time
+  for(; pos < num_points; pos++){
+    float v = inputVector[pos] * scalar;
+    v = (v > clipHigh) ? clipHigh : ((v < clipLow) ? clipLow : v);
+    outputVector[pos] = (int16_t)rintf(v);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  // Saturation limits of the signed 16 bit output
+  const float clipLow = -32768;
+  const float clipHigh = 32767;
+  // Number of samples handled by the SIMD loop (largest multiple of 4)
+  const unsigned int vectorised = (num_points / 4) * 4;
+
+  const __m128 gain = _mm_set_ps1(scalar);
+  const __m128 lowQuad = _mm_set_ps1(clipLow);
+  const __m128 highQuad = _mm_set_ps1(clipHigh);
+
+  // Scratch buffer for the aligned store; lanes are rounded from here in C
+  __VOLK_ATTR_ALIGNED(16) float lanes[4];
+  unsigned int pos = 0;
+
+  while(pos < vectorised){
+    // Aligned load, scale and clip
+    __m128 quad = _mm_load_ps(inputVector + pos);
+    quad = _mm_max_ps(_mm_min_ps(_mm_mul_ps(quad, gain), highQuad), lowQuad);
+
+    _mm_store_ps(lanes, quad);
+    outputVector[pos] = (int16_t)rintf(lanes[0]);
+    outputVector[pos + 1] = (int16_t)rintf(lanes[1]);
+    outputVector[pos + 2] = (int16_t)rintf(lanes[2]);
+    outputVector[pos + 3] = (int16_t)rintf(lanes[3]);
+    pos += 4;
+  }
+
+  // Remaining samples are processed one at a time
+  for(; pos < num_points; pos++){
+    float v = inputVector[pos] * scalar;
+    v = (v > clipHigh) ? clipHigh : ((v < clipLow) ? clipLow : v);
+    outputVector[pos] = (int16_t)rintf(v);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  // Saturation limits of the signed 16 bit output
+  const float clipLow = -32768;
+  const float clipHigh = 32767;
+  const float* src = inputVector;
+  const float* const srcEnd = inputVector + num_points;
+  int16_t* dst = outputVector;
+
+  while(src != srcEnd){
+    float v = *src++ * scalar;
+    // Clip before rounding so the result always fits into an int16_t
+    if(v < clipLow)
+      v = clipLow;
+    else if(v > clipHigh)
+      v = clipHigh;
+    *dst++ = (int16_t)rintf(v);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
diff --git a/volk/kernels/volk/volk_32f_s32f_convert_32i.h b/volk/kernels/volk/volk_32f_s32f_convert_32i.h
new file mode 100644
index 000000000..1a46093ee
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_s32f_convert_32i.h
@@ -0,0 +1,331 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
+#define INCLUDED_volk_32f_s32f_convert_32i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+
+  // Both bounds are exact floats: -2^31 is INT32_MIN, 2^31 is one past INT32_MAX.
+  // 2147483647 itself is not representable as a float (it rounds up to 2^31),
+  // so a value clipped to the upper bound still needs explicit saturation.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483648.0f;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, overflowMask;
+  __m128i intInputVal1;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    // Scale and clip
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    // Lanes sitting at 2^31 do not fit in an int32_t; the conversion yields 0x80000000 for them
+    overflowMask = _mm_cmpge_ps(inputVal1, vmax_val);
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
+    // XOR with the all-ones mask turns 0x80000000 into 0x7FFFFFFF (INT32_MAX); other lanes are unchanged
+    intInputVal1 = _mm_xor_si128(intInputVal1, _mm_castps_si128(overflowMask));
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(!(r < max_val))       // negated test: a NaN never reaches the conversion
+      outputVector[number] = 2147483647;
+    else if(r < min_val)
+      outputVector[number] = -2147483647 - 1;
+    else                     // same conversion (and rounding) as the vector loop
+      outputVector[number] = _mm_cvtss_si32(_mm_set_ss(r));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  unsigned int lane;
+
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+
+  // Both bounds are exact floats: -2^31 is INT32_MIN, 2^31 is one past INT32_MAX.
+  // 2147483647 itself is not representable as a float (it rounds up to 2^31),
+  // so a value at the upper bound must be saturated instead of cast.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483648.0f;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  // Scratch buffer for the aligned _mm_store_ps below
+  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    // Scale and clip
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    for(lane = 0; lane < 4; lane++){
+      r = outputFloatBuffer[lane];
+      // A lane clipped to 2^31 cannot be cast to int32_t; emit INT32_MAX instead.
+      // Every other lane is truncated toward zero by the cast.
+      *outputVectorPtr++ = (!(r < max_val)) ? 2147483647 : (int32_t)(r);
+    }
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(!(r < max_val))       // negated test: a NaN never reaches the cast
+      outputVector[number] = 2147483647;
+    else if(r < min_val)
+      outputVector[number] = -2147483647 - 1;
+    else
+      outputVector[number] = (int32_t)(r);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int32_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  // Both bounds are exact floats: -2^31 is INT32_MIN, 2^31 is one past INT32_MAX.
+  // 2147483647 itself is not representable as a float (it rounds up to 2^31),
+  // so clamping to it and casting would convert an out-of-range value.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483648.0f;
+  float r;
+
+  for(number = 0; number < num_points; number++){
+    r = *inputVectorPtr++ * scalar;
+    if(!(r < max_val))       // negated test: a NaN never reaches the cast
+      *outputVectorPtr++ = 2147483647;
+    else if(r < min_val)
+      *outputVectorPtr++ = -2147483647 - 1;
+    else                     // in range: the cast truncates toward zero
+      *outputVectorPtr++ = (int32_t)(r);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
+#define INCLUDED_volk_32f_s32f_convert_32i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int eighthPoints = num_points / 8;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+
+  // Both bounds are exact floats: -2^31 is INT32_MIN, 2^31 is one past INT32_MAX.
+  // 2147483647 itself is not representable as a float (it rounds up to 2^31),
+  // so a value clipped to the upper bound still needs explicit saturation.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483648.0f;
+  float r;
+
+  __m256 vScalar = _mm256_set1_ps(scalar);
+  __m256 inputVal1, overflowMask;
+  __m256i intInputVal1;
+  __m256 vmin_val = _mm256_set1_ps(min_val);
+  __m256 vmax_val = _mm256_set1_ps(max_val);
+
+  for(;number < eighthPoints; number++){
+    inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+
+    // Scale and clip
+    inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    // Lanes sitting at 2^31 do not fit in an int32_t; the conversion yields 0x80000000 for them
+    overflowMask = _mm256_cmp_ps(inputVal1, vmax_val, _CMP_GE_OQ);
+    intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+    // XOR with the all-ones mask turns 0x80000000 into 0x7FFFFFFF (INT32_MAX).
+    // Done in the float domain because 256 bit integer XOR is not part of AVX.
+    intInputVal1 = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(intInputVal1), overflowMask));
+
+    _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(!(r < max_val))       // negated test: a NaN never reaches the conversion
+      outputVector[number] = 2147483647;
+    else if(r < min_val)
+      outputVector[number] = -2147483647 - 1;
+    else                     // same rounding conversion as the vector loop
+      outputVector[number] = _mm_cvtss_si32(_mm_set_ss(r));
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+
+  // Both bounds are exact floats: -2^31 is INT32_MIN, 2^31 is one past INT32_MAX.
+  // 2147483647 itself is not representable as a float (it rounds up to 2^31),
+  // so a value clipped to the upper bound still needs explicit saturation.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483648.0f;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, overflowMask;
+  __m128i intInputVal1;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    // Scale and clip
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    // Lanes sitting at 2^31 do not fit in an int32_t; the conversion yields 0x80000000 for them
+    overflowMask = _mm_cmpge_ps(inputVal1, vmax_val);
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
+    // XOR with the all-ones mask turns 0x80000000 into 0x7FFFFFFF (INT32_MAX); other lanes are unchanged
+    intInputVal1 = _mm_xor_si128(intInputVal1, _mm_castps_si128(overflowMask));
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(!(r < max_val))       // negated test: a NaN never reaches the conversion
+      outputVector[number] = 2147483647;
+    else if(r < min_val)
+      outputVector[number] = -2147483647 - 1;
+    else                     // same conversion (and rounding) as the vector loop
+      outputVector[number] = _mm_cvtss_si32(_mm_set_ss(r));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  unsigned int lane;
+
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+
+  // Both bounds are exact floats: -2^31 is INT32_MIN, 2^31 is one past INT32_MAX.
+  // 2147483647 itself is not representable as a float (it rounds up to 2^31),
+  // so a value at the upper bound must be saturated instead of cast.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483648.0f;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  // Scratch buffer for the aligned _mm_store_ps below
+  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    // Scale and clip
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    for(lane = 0; lane < 4; lane++){
+      r = outputFloatBuffer[lane];
+      // A lane clipped to 2^31 cannot be cast to int32_t; emit INT32_MAX instead.
+      // Every other lane is truncated toward zero by the cast.
+      *outputVectorPtr++ = (!(r < max_val)) ? 2147483647 : (int32_t)(r);
+    }
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(!(r < max_val))       // negated test: a NaN never reaches the cast
+      outputVector[number] = 2147483647;
+    else if(r < min_val)
+      outputVector[number] = -2147483647 - 1;
+    else
+      outputVector[number] = (int32_t)(r);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int32_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  // Both bounds are exact floats: -2^31 is INT32_MIN, 2^31 is one past INT32_MAX.
+  // 2147483647 itself is not representable as a float (it rounds up to 2^31),
+  // so clamping to it and casting would convert an out-of-range value.
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483648.0f;
+  float r;
+
+  for(number = 0; number < num_points; number++){
+    r = *inputVectorPtr++ * scalar;
+    if(!(r < max_val))       // negated test: a NaN never reaches the cast
+      *outputVectorPtr++ = 2147483647;
+    else if(r < min_val)
+      *outputVectorPtr++ = -2147483647 - 1;
+    else                     // in range: the cast truncates toward zero
+      *outputVectorPtr++ = (int32_t)(r);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */
diff --git a/volk/kernels/volk/volk_32f_s32f_convert_8i.h b/volk/kernels/volk/volk_32f_s32f_convert_8i.h
new file mode 100644
index 000000000..b45150522
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_s32f_convert_8i.h
@@ -0,0 +1,312 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
+#define INCLUDED_volk_32f_s32f_convert_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+
+  // Saturation limits of the signed 8 bit output
+  float min_val = -128;
+  float max_val = 127;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  // Vector part: sixteen floats in, sixteen int8 values out per iteration
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    // Scale and clip
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+    inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+    inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
+    intInputVal2 = _mm_cvtps_epi32(inputVal2);
+    intInputVal3 = _mm_cvtps_epi32(inputVal3);
+    intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+    // Narrow 32 -> 16 -> 8 bits
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    // Use the same float->int conversion as the vector loop so that a sample is
+    // rounded identically wherever it sits in the buffer (a plain cast truncates)
+    outputVector[number] = (int8_t)_mm_cvtss_si32(_mm_set_ss(r));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+
+  // Saturation limits of the signed 8 bit output
+  float min_val = -128;
+  float max_val = 127;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  // Scratch buffer for the aligned _mm_store_ps below
+  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    // Scale and clip
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+    // Spill the clipped floats and cast lane by lane (truncates toward zero)
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    // Cast to the output element type, as the lane stores above do (was int16_t)
+    outputVector[number] = (int8_t)(r);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned
+ */
+static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  // Saturation limits of the signed 8 bit output
+  float min_val = -128;
+  float max_val = 127;
+  float r;
+
+  for(number = 0; number < num_points; number++){
+    r = *inputVectorPtr++ * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    // Cast straight to the output element type (was int16_t); the fractional
+    // part is truncated toward zero
+    *outputVectorPtr++ = (int8_t)(r);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
+#define INCLUDED_volk_32f_s32f_convert_8i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+
+  // Saturation limits of the signed 8 bit output
+  float min_val = -128;
+  float max_val = 127;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  // Vector part: sixteen floats in, sixteen int8 values out per iteration
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    // Scale and clip
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+    inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+    inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
+    intInputVal2 = _mm_cvtps_epi32(inputVal2);
+    intInputVal3 = _mm_cvtps_epi32(inputVal3);
+    intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+    // Narrow 32 -> 16 -> 8 bits
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    // Use the same float->int conversion as the vector loop so that a sample is
+    // rounded identically wherever it sits in the buffer (a plain cast truncates)
+    outputVector[number] = (int8_t)_mm_cvtss_si32(_mm_set_ss(r));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  // Saturation limits of the signed 8 bit output
+  const float clipLow = -128;
+  const float clipHigh = 127;
+  // Number of samples handled by the SIMD loop (largest multiple of 4)
+  const unsigned int vectorised = (num_points / 4) * 4;
+
+  const __m128 gain = _mm_set_ps1(scalar);
+  const __m128 lowQuad = _mm_set_ps1(clipLow);
+  const __m128 highQuad = _mm_set_ps1(clipHigh);
+
+  // Scratch buffer for the aligned store; lanes are cast from here in C
+  __VOLK_ATTR_ALIGNED(16) float lanes[4];
+  unsigned int pos = 0;
+
+  while(pos < vectorised){
+    // Aligned load, scale and clip
+    __m128 quad = _mm_load_ps(inputVector + pos);
+    quad = _mm_max_ps(_mm_min_ps(_mm_mul_ps(quad, gain), highQuad), lowQuad);
+
+    _mm_store_ps(lanes, quad);
+    outputVector[pos] = (int8_t)(lanes[0]);
+    outputVector[pos + 1] = (int8_t)(lanes[1]);
+    outputVector[pos + 2] = (int8_t)(lanes[2]);
+    outputVector[pos + 3] = (int8_t)(lanes[3]);
+    pos += 4;
+  }
+
+  // Remaining samples are processed one at a time
+  for(; pos < num_points; pos++){
+    float v = inputVector[pos] * scalar;
+    v = (v > clipHigh) ? clipHigh : ((v < clipLow) ? clipLow : v);
+    outputVector[pos] = (int8_t)(v);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  // Saturation limits of the signed 8 bit output
+  const float clipLow = -128;
+  const float clipHigh = 127;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    const float scaled = inputVector[idx] * scalar;
+    // Clip first so the value always fits; the cast then drops the fraction
+    const float clipped = (scaled > clipHigh) ? clipHigh
+                        : ((scaled < clipLow) ? clipLow : scaled);
+    outputVector[idx] = (int8_t)(clipped);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
diff --git a/volk/kernels/volk/volk_32f_s32f_multiply_32f.h b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h
new file mode 100644
index 000000000..2dd86a17c
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h
@@ -0,0 +1,221 @@
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplies every element of a float vector by a scalar (SSE, unaligned loads and stores)
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be scaled
+  \param scalar The scalar value applied to each element
+  \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  const __m128 scalarVec = _mm_set_ps1(scalar);
+  const float* src = aVector;
+  float* dst = cVector;
+  unsigned int i;
+
+  // Vector body: four floats per iteration
+  for(i = 0; i < quarterPoints; i++){
+    _mm_storeu_ps(dst, _mm_mul_ps(_mm_loadu_ps(src), scalarVec));
+    src += 4;
+    dst += 4;
+  }
+
+  // Scalar tail for the remaining (num_points % 4) values
+  for(i = quarterPoints * 4; i < num_points; i++){
+    *dst++ = (*src++) * scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Multiplies every element of a float vector by a scalar (AVX, unaligned loads and stores)
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be scaled
+  \param scalar The scalar value applied to each element
+  \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  const unsigned int eighthPoints = num_points / 8;
+  const __m256 scalarVec = _mm256_set1_ps(scalar);
+  const float* src = aVector;
+  float* dst = cVector;
+  unsigned int i;
+
+  // Vector body: eight floats per iteration
+  for(i = 0; i < eighthPoints; i++){
+    _mm256_storeu_ps(dst, _mm256_mul_ps(_mm256_loadu_ps(src), scalarVec));
+    src += 8;
+    dst += 8;
+  }
+
+  // Scalar tail for the remaining (num_points % 8) values
+  for(i = eighthPoints * 8; i < num_points; i++){
+    *dst++ = (*src++) * scalar;
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies every element of a float vector by a scalar (portable C)
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be scaled
+  \param scalar The scalar value applied to each element
+  \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = aVector[idx] * scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplies every element of a float vector by a scalar (SSE, aligned loads and stores)
+  \param cVector The vector where the results will be stored; written with aligned 16 byte stores
+  \param aVector The vector to be scaled; read with aligned 16 byte loads
+  \param scalar The scalar value applied to each element
+  \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  const __m128 scalarVec = _mm_set_ps1(scalar);
+  const float* src = aVector;
+  float* dst = cVector;
+  unsigned int i;
+
+  // Vector body: four floats per iteration
+  for(i = 0; i < quarterPoints; i++){
+    _mm_store_ps(dst, _mm_mul_ps(_mm_load_ps(src), scalarVec));
+    src += 4;
+    dst += 4;
+  }
+
+  // Scalar tail for the remaining (num_points % 4) values
+  for(i = quarterPoints * 4; i < num_points; i++){
+    *dst++ = (*src++) * scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Multiplies every element of a float vector by a scalar (AVX, aligned loads and stores)
+  \param cVector The vector where the results will be stored; written with aligned 32 byte stores
+  \param aVector The vector to be scaled; read with aligned 32 byte loads
+  \param scalar The scalar value applied to each element
+  \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  const unsigned int eighthPoints = num_points / 8;
+  const __m256 scalarVec = _mm256_set1_ps(scalar);
+  const float* src = aVector;
+  float* dst = cVector;
+  unsigned int i;
+
+  // Vector body: eight floats per iteration
+  for(i = 0; i < eighthPoints; i++){
+    _mm256_store_ps(dst, _mm256_mul_ps(_mm256_load_ps(src), scalarVec));
+    src += 8;
+    dst += 8;
+  }
+
+  // Scalar tail for the remaining (num_points % 8) values
+  for(i = eighthPoints * 8; i < num_points; i++){
+    *dst++ = (*src++) * scalar;
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies every element of a float vector by a scalar (portable C)
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be scaled
+  \param scalar The scalar value applied to each element
+  \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = aVector[idx] * scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Scalar float multiply, delegated to the externally defined ORC implementation
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied by the scalar
+ \param scalar the scalar value
+ \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+
+ NOTE(review): this wrapper is named _u_ but forwards to the _a_ ORC routine; presumably the
+ ORC code has no alignment requirement - confirm against the .orc source.
+*/
+extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, const float scalar, unsigned int num_points);
+static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+ // Arguments are passed through unchanged
+ volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_s32f_normalize.h b/volk/kernels/volk/volk_32f_s32f_normalize.h
new file mode 100644
index 000000000..a0bd33c7d
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_s32f_normalize.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_s32f_normalize_a_H
+#define INCLUDED_volk_32f_s32f_normalize_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Normalizes all points in the buffer in place by the scalar value ( each data point is multiplied by 1 / scalar )
+  \param vecBuffer The buffer of values to be normalized; read and written with aligned 16 byte accesses
+  \param scalar The value each buffer entry is divided by
+  \param num_points The number of values in vecBuffer
+*/
+static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float scalar, unsigned int num_points){
+  const float invScalar = 1.0 / scalar;
+  const __m128 invScalarVec = _mm_set_ps1(invScalar);
+  const uint64_t quarterPoints = num_points / 4;
+  float* cursor = vecBuffer;
+  unsigned int i;
+
+  // Vector body: scale four floats per iteration, writing back over the input
+  for(i = 0; i < quarterPoints; i++){
+    _mm_store_ps(cursor, _mm_mul_ps(_mm_load_ps(cursor), invScalarVec));
+    cursor += 4;
+  }
+
+  // Scalar tail for the remaining (num_points % 4) values
+  for(i = quarterPoints*4; i < num_points; i++){
+    *cursor *= invScalar;
+    cursor++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Normalizes all points in the buffer in place by the scalar value ( each data point is multiplied by 1 / scalar )
+  \param vecBuffer The buffer of values to be normalized
+  \param scalar The value each buffer entry is divided by
+  \param num_points The number of values in vecBuffer
+*/
+static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){
+  const float invScalar = 1.0 / scalar;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    vecBuffer[idx] *= invScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Normalizes all points in the buffer in place by the scalar value, delegated to the externally defined ORC implementation
+ \param vecBuffer The buffer of values to be normalized; passed as both destination and source
+ \param scalar The value each buffer entry is divided by
+ \param num_points The number of values in vecBuffer
+
+ NOTE(review): this wrapper is named _u_ but forwards to the _a_ ORC routine; presumably the
+ ORC code has no alignment requirement - confirm against the .orc source.
+*/
+extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points);
+static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float scalar, unsigned int num_points){
+ // The reciprocal, not the raw scalar, is what gets handed to the ORC routine
+ float invscalar = 1.0 / scalar;
+ volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_normalize_a_H */
diff --git a/volk/kernels/volk/volk_32f_s32f_power_32f.h b/volk/kernels/volk/volk_32f_s32f_power_32f.h
new file mode 100644
index 000000000..282244468
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_s32f_power_32f.h
@@ -0,0 +1,144 @@
+#ifndef INCLUDED_volk_32f_s32f_power_32f_a_H
+#define INCLUDED_volk_32f_s32f_power_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Takes each input vector value to the specified power and stores the results in the return vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of values to be taken to a power
+  \param power The power value to be applied to each data point
+  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+
+  The vector path is only compiled when LV_HAVE_LIB_SIMDMATH is defined; it uses aligned
+  16 byte loads and stores. Without it every point goes through powf().
+  _mm_blendv_ps is an SSE4.1 intrinsic, hence the <smmintrin.h> include above.
+*/
+static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){
+  unsigned int number = 0;
+
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+  const unsigned int quarterPoints = num_points / 4;
+  __m128 vPower = _mm_set_ps1(power);
+  __m128 zeroValue = _mm_setzero_ps();
+  __m128 signMask;
+  __m128 negatedValues;
+  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+  __m128 onesMask = _mm_set_ps1(1);
+
+  __m128 aVal, cVal;
+  for(;number < quarterPoints; number++){
+
+    aVal = _mm_load_ps(aPtr);
+    signMask = _mm_cmplt_ps(aVal, zeroValue);               // all-ones in lanes where x < 0
+    negatedValues = _mm_sub_ps(zeroValue, aVal);             // -x
+    aVal = _mm_blendv_ps(aVal, negatedValues, signMask);     // |x|
+
+    // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
+    cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+    // Negative lanes are scaled by (-1)^power, the others by 1
+    cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
+
+    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  // Scalar tail (or the whole buffer when the vector path is not compiled in)
+  for(;number < num_points; number++){
+    *cPtr++ = powf((*aPtr++), power);
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Raises each input vector value to the specified power and stores the results in the return vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of values to be taken to a power
+  \param power The power value to be applied to each data point
+  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+
+  The vector path is only compiled when LV_HAVE_LIB_SIMDMATH is defined; otherwise every point goes through powf().
+*/
+static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){
+  unsigned int idx = 0;
+  const float* src = aVector;
+  float* dst = cVector;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+  const unsigned int quarterPoints = num_points / 4;
+  const __m128 exponent = _mm_set_ps1(power);
+  const __m128 zero = _mm_setzero_ps();
+  const __m128 minusOneToPower = _mm_set_ps1(powf(-1, power));
+  const __m128 one = _mm_set_ps1(1);
+  __m128 base, isNegative, magnitude, signFactor, raised;
+
+  while(idx < quarterPoints){
+    base = _mm_load_ps(src);
+    isNegative = _mm_cmplt_ps(base, zero);
+
+    // |x|: keep x in non-negative lanes, take (0 - x) in negative lanes
+    magnitude = _mm_or_ps(_mm_andnot_ps(isNegative, base), _mm_and_ps(isNegative, _mm_sub_ps(zero, base)));
+
+    // powf4 does not accept negative bases, so it only ever sees the magnitude
+    raised = powf4(magnitude, exponent);
+
+    // Negative lanes are scaled by (-1)^power, the others by 1
+    signFactor = _mm_or_ps(_mm_andnot_ps(isNegative, one), _mm_and_ps(isNegative, minusOneToPower));
+
+    _mm_store_ps(dst, _mm_mul_ps(signFactor, raised));
+
+    src += 4;
+    dst += 4;
+    idx++;
+  }
+
+  idx = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  // Scalar tail (or the whole buffer when the vector path is not compiled in)
+  for(; idx < num_points; idx++){
+    *dst++ = powf((*src++), power);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Raises each input vector value to the specified power with powf() and stores the results in the return vector
+   \param cVector The vector where the results will be stored
+   \param aVector The vector of values to be taken to a power
+   \param power The power value to be applied to each data point
+   \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+ */
+static inline void volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = powf(aVector[idx], power);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_s32f_stddev_32f.h b/volk/kernels/volk/volk_32f_s32f_stddev_32f.h
new file mode 100644
index 000000000..0622b278a
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_s32f_stddev_32f.h
@@ -0,0 +1,145 @@
+#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
+#define INCLUDED_volk_32f_s32f_stddev_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Calculates the standard deviation of the input buffer as sqrt( mean(x^2) - mean^2 ) using the supplied mean
+  \param stddev The calculated standard deviation (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for; read with aligned 16 byte loads
+  \param mean The mean of the input buffer
+  \param num_points The number of values in input buffer to be used in the stddev calculation
+*/
+static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+  const unsigned int sixteenthPoints = num_points / 16;
+  const float* src = inputBuffer;
+  __VOLK_ATTR_ALIGNED(16) float laneSums[4];
+  __m128 sumOfSquares = _mm_setzero_ps();
+  __m128 block;
+  __m128 dot0, dot1, dot2, dot3;
+  float meanSquare;
+  unsigned int i;
+
+  // An empty buffer is reported as a deviation of zero
+  if(num_points == 0){
+    *stddev = 0;
+    return;
+  }
+
+  for(i = 0; i < sixteenthPoints; i++){
+    // Each dot product sums four squares and deposits the result in its own
+    // output lane (mask low nibble 1, 2, 4, 8), leaving the other lanes zero
+    block = _mm_load_ps(src); src += 4;
+    dot0 = _mm_dp_ps(block, block, 0xF1);
+
+    block = _mm_load_ps(src); src += 4;
+    dot1 = _mm_dp_ps(block, block, 0xF2);
+
+    block = _mm_load_ps(src); src += 4;
+    dot2 = _mm_dp_ps(block, block, 0xF4);
+
+    block = _mm_load_ps(src); src += 4;
+    dot3 = _mm_dp_ps(block, block, 0xF8);
+
+    // The four results occupy disjoint lanes, so OR merges them into one vector
+    dot0 = _mm_or_ps(dot0, dot1);
+    dot2 = _mm_or_ps(dot2, dot3);
+    dot0 = _mm_or_ps(dot0, dot2);
+
+    sumOfSquares = _mm_add_ps(sumOfSquares, dot0);
+  }
+
+  // Horizontal reduction of the four lane totals
+  _mm_store_ps(laneSums, sumOfSquares);
+  meanSquare = laneSums[0];
+  for(i = 1; i < 4; i++){
+    meanSquare += laneSums[i];
+  }
+
+  // Scalar tail for the remaining (num_points % 16) values
+  for(i = sixteenthPoints * 16; i < num_points; i++){
+    meanSquare += (*src) * (*src);
+    src++;
+  }
+
+  meanSquare /= num_points;
+  meanSquare -= (mean * mean);
+  *stddev = sqrtf(meanSquare);
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the standard deviation of the input buffer as sqrt( mean(x^2) - mean^2 ) using the supplied mean
+  \param stddev The calculated standard deviation (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for; read with aligned 16 byte loads
+  \param mean The mean of the input buffer
+  \param num_points The number of values in input buffer to be used in the stddev calculation
+*/
+static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  const float* src = inputBuffer;
+  __VOLK_ATTR_ALIGNED(16) float laneSums[4];
+  __m128 sumOfSquares = _mm_setzero_ps();
+  __m128 samples;
+  float meanSquare;
+  unsigned int i;
+
+  // An empty buffer is reported as a deviation of zero
+  if(num_points == 0){
+    *stddev = 0;
+    return;
+  }
+
+  // Vector body: accumulate x^2 for four floats per iteration
+  for(i = 0; i < quarterPoints; i++){
+    samples = _mm_load_ps(src);
+    sumOfSquares = _mm_add_ps(sumOfSquares, _mm_mul_ps(samples, samples));
+    src += 4;
+  }
+
+  // Horizontal reduction of the four lane totals
+  _mm_store_ps(laneSums, sumOfSquares);
+  meanSquare = laneSums[0];
+  for(i = 1; i < 4; i++){
+    meanSquare += laneSums[i];
+  }
+
+  // Scalar tail for the remaining (num_points % 4) values
+  for(i = quarterPoints * 4; i < num_points; i++){
+    meanSquare += (*src) * (*src);
+    src++;
+  }
+
+  meanSquare /= num_points;
+  meanSquare -= (mean * mean);
+  *stddev = sqrtf(meanSquare);
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Calculates the standard deviation of the input buffer as sqrt( mean(x^2) - mean^2 ) using the supplied mean
+  \param stddev The calculated standard deviation (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param mean The mean of the input buffer
+  \param num_points The number of values in input buffer to be used in the stddev calculation
+*/
+static inline void volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+  float meanSquare = 0;
+  unsigned int idx;
+
+  // An empty buffer is reported as a deviation of zero
+  if(num_points == 0){
+    *stddev = 0;
+    return;
+  }
+
+  for(idx = 0; idx < num_points; idx++){
+    meanSquare += inputBuffer[idx] * inputBuffer[idx];
+  }
+
+  meanSquare /= num_points;
+  meanSquare -= (mean * mean);
+  *stddev = sqrtf(meanSquare);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_sqrt_32f.h b/volk/kernels/volk/volk_32f_sqrt_32f.h
new file mode 100644
index 000000000..ab9fffd7d
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_sqrt_32f.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_32f_sqrt_32f_a_H
+#define INCLUDED_volk_32f_sqrt_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Takes the square root of each value in the input vector and stores the results in the output vector
+  \param cVector The vector where the results will be stored; written with aligned 16 byte stores
+  \param aVector The vector of values to take the square root of; read with aligned 16 byte loads
+  \param num_points The number of values in aVector to be processed and stored into cVector
+*/
+static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  const float* src = aVector;
+  float* dst = cVector;
+  unsigned int i;
+
+  // Vector body: four floats per iteration
+  for(i = 0; i < quarterPoints; i++){
+    _mm_store_ps(dst, _mm_sqrt_ps(_mm_load_ps(src)));
+    src += 4;
+    dst += 4;
+  }
+
+  // Scalar tail for the remaining (num_points % 4) values
+  for(i = quarterPoints * 4; i < num_points; i++){
+    *dst++ = sqrtf(*src++);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Takes the square root of each value in the input vector with sqrtf() and stores the results in the output vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of values to take the square root of
+  \param num_points The number of values in aVector to be processed and stored into cVector
+*/
+static inline void volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = sqrtf(aVector[idx]);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+extern void volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int);
+/*!
+ \brief Takes the square root of each value in the input vector, delegated to the externally defined ORC implementation
+ \param cVector The vector where the results will be stored
+ \param aVector The vector of values to take the square root of
+ \param num_points The number of values in aVector to be processed and stored into cVector
+
+ NOTE(review): this wrapper is named _u_ but forwards to the _a_ ORC routine; presumably the
+ ORC code has no alignment requirement - confirm against the .orc source.
+*/
+static inline void volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points){
+ // Arguments are passed through unchanged
+ volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h b/volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
new file mode 100644
index 000000000..9bded6713
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
@@ -0,0 +1,170 @@
+#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
+#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation (0 when num_points is 0)
+  \param mean The calculated mean of the input buffer (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for; read with aligned 16 byte loads
+  \param num_points The number of values in input buffer to be used in the stddev and mean calculations
+
+  The deviation is computed as sqrt( mean(x^2) - mean^2 ). Floating point rounding can push
+  that difference slightly below zero for (nearly) constant input, so it is clamped to zero
+  before the square root instead of letting sqrtf() return NaN.
+*/
+static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const float* aPtr = inputBuffer;
+    __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal1, aVal2, aVal3, aVal4;
+    __m128 cVal1, cVal2, cVal3, cVal4;
+    for(;number < sixteenthPoints; number++) {
+      // Each dot product sums four squares into its own output lane
+      // (mask low nibble 1, 2, 4, 8) and leaves the other lanes zero
+      aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+      accumulator = _mm_add_ps(accumulator, aVal1);  // accumulator += x
+
+      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+      accumulator = _mm_add_ps(accumulator, aVal2);  // accumulator += x
+
+      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+      accumulator = _mm_add_ps(accumulator, aVal3);  // accumulator += x
+
+      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+      accumulator = _mm_add_ps(accumulator, aVal4);  // accumulator += x
+
+      // The four results occupy disjoint lanes, so OR merges them into one vector
+      cVal1 = _mm_or_ps(cVal1, cVal2);
+      cVal3 = _mm_or_ps(cVal3, cVal4);
+      cVal1 = _mm_or_ps(cVal1, cVal3);
+
+      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+    }
+    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+    newMean = meanBuffer[0];
+    newMean += meanBuffer[1];
+    newMean += meanBuffer[2];
+    newMean += meanBuffer[3];
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+
+    // Scalar tail for the remaining (num_points % 16) values
+    number = sixteenthPoints * 16;
+    for(;number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    // Rounding can leave a tiny negative variance; clamp so sqrtf() cannot yield NaN
+    if(returnValue < 0){
+      returnValue = 0;
+    }
+    returnValue = sqrtf(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation (0 when num_points is 0)
+  \param mean The calculated mean of the input buffer (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for; read with aligned 16 byte loads
+  \param num_points The number of values in input buffer to be used in the stddev and mean calculations
+
+  The deviation is computed as sqrt( mean(x^2) - mean^2 ). Floating point rounding can push
+  that difference slightly below zero for (nearly) constant input, so it is clamped to zero
+  before the square root instead of letting sqrtf() return NaN.
+*/
+static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* aPtr = inputBuffer;
+    __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal = _mm_setzero_ps();
+    for(;number < quarterPoints; number++) {
+      aVal = _mm_load_ps(aPtr);                     // aVal = x
+      accumulator = _mm_add_ps(accumulator, aVal);  // accumulator += x
+      aVal = _mm_mul_ps(aVal, aVal);                // aVal = x^2
+      squareAccumulator = _mm_add_ps(squareAccumulator, aVal); // squareAccumulator += x^2
+      aPtr += 4;
+    }
+    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+    newMean = meanBuffer[0];
+    newMean += meanBuffer[1];
+    newMean += meanBuffer[2];
+    newMean += meanBuffer[3];
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+
+    // Scalar tail for the remaining (num_points % 4) values
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    // Rounding can leave a tiny negative variance; clamp so sqrtf() cannot yield NaN
+    if(returnValue < 0){
+      returnValue = 0;
+    }
+    returnValue = sqrtf(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation (0 when num_points is 0)
+  \param mean The calculated mean of the input buffer (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to be used in the stddev and mean calculations
+
+  The deviation is computed as sqrt( mean(x^2) - mean^2 ). Floating point rounding can push
+  that difference slightly below zero for (nearly) constant input, so it is clamped to zero
+  before the square root instead of letting sqrtf() return NaN.
+*/
+static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    const float* aPtr = inputBuffer;
+    unsigned int number = 0;
+
+    // Single pass: accumulate sum(x^2) and sum(x) together
+    for(number = 0; number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    // Rounding can leave a tiny negative variance; clamp so sqrtf() cannot yield NaN
+    if(returnValue < 0){
+      returnValue = 0;
+    }
+    returnValue = sqrtf(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
diff --git a/volk/kernels/volk/volk_32f_x2_add_32f.h b/volk/kernels/volk/volk_32f_x2_add_32f.h
new file mode 100644
index 000000000..42278f606
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_add_32f.h
@@ -0,0 +1,147 @@
+#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
+#define INCLUDED_volk_32f_x2_add_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Adds the two input vectors element by element and stores the sums in the third vector (SSE, unaligned loads and stores)
+  \param cVector The vector where the results will be stored
+  \param aVector The first addend vector
+  \param bVector The second addend vector
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  const float* lhs = aVector;
+  const float* rhs = bVector;
+  float* dst = cVector;
+  unsigned int i;
+
+  // Vector body: four sums per iteration
+  for(i = 0; i < quarterPoints; i++){
+    _mm_storeu_ps(dst, _mm_add_ps(_mm_loadu_ps(lhs), _mm_loadu_ps(rhs)));
+    lhs += 4;
+    rhs += 4;
+    dst += 4;
+  }
+
+  // Scalar tail for the remaining (num_points % 4) values
+  for(i = quarterPoints * 4; i < num_points; i++){
+    *dst++ = (*lhs++) + (*rhs++);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Adds the two input vectors element by element and stores the sums in the third vector (portable C)
+  \param cVector The vector where the results will be stored
+  \param aVector The first addend vector
+  \param bVector The second addend vector
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = aVector[idx] + bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
+#ifndef INCLUDED_volk_32f_x2_add_32f_a_H
+#define INCLUDED_volk_32f_x2_add_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Adds the two input vectors element by element and stores the sums in the third vector (SSE, aligned loads and stores)
+  \param cVector The vector where the results will be stored; written with aligned 16 byte stores
+  \param aVector The first addend vector; read with aligned 16 byte loads
+  \param bVector The second addend vector; read with aligned 16 byte loads
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  const unsigned int quarterPoints = num_points / 4;
+  const float* lhs = aVector;
+  const float* rhs = bVector;
+  float* dst = cVector;
+  unsigned int i;
+
+  // Vector body: four sums per iteration
+  for(i = 0; i < quarterPoints; i++){
+    _mm_store_ps(dst, _mm_add_ps(_mm_load_ps(lhs), _mm_load_ps(rhs)));
+    lhs += 4;
+    rhs += 4;
+    dst += 4;
+  }
+
+  // Scalar tail for the remaining (num_points % 4) values
+  for(i = quarterPoints * 4; i < num_points; i++){
+    *dst++ = (*lhs++) + (*rhs++);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Adds the two input vectors element by element and stores the sums in the third vector (portable C)
+  \param cVector The vector where the results will be stored
+  \param aVector The first addend vector
+  \param bVector The second addend vector
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = aVector[idx] + bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Adds the two input vectors and stores their results in the third vector, delegated to the externally defined ORC implementation
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+
+ NOTE(review): this wrapper is named _u_ but forwards to the _a_ ORC routine; presumably the
+ ORC code has no alignment requirement - confirm against the .orc source.
+*/
+extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ // Arguments are passed through unchanged
+ volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_x2_divide_32f.h b/volk/kernels/volk/volk_32f_x2_divide_32f.h
new file mode 100644
index 000000000..d5a7c7d7c
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_divide_32f.h
@@ -0,0 +1,82 @@
+#ifndef INCLUDED_volk_32f_x2_divide_32f_a_H
+#define INCLUDED_volk_32f_x2_divide_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Divides aVector by bVector element-wise and stores the quotients in cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divided (dividend)
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divided and stored into cVector
+  \note Uses _mm_load_ps/_mm_store_ps, so all three buffers must be 16-byte aligned
+*/
+static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  const unsigned int vectorLoops = num_points / 4;
+  unsigned int block;
+  unsigned int idx;
+
+  // Main loop: four quotients per iteration through aligned 128-bit accesses
+  for(block = 0; block < vectorLoops; block++){
+    const unsigned int offset = block * 4;
+
+    // Fetch four dividends and four divisors
+    const __m128 dividend = _mm_load_ps(aVector + offset);
+    const __m128 divisor = _mm_load_ps(bVector + offset);
+
+    // Lane-wise a / b
+    const __m128 quotient = _mm_div_ps(dividend, divisor);
+
+    // Write the four results out
+    _mm_store_ps(cVector + offset, quotient);
+  }
+
+  // Scalar tail: handles the num_points % 4 elements the vector loop left over
+  for(idx = vectorLoops * 4; idx < num_points; idx++){
+    const float dividend = aVector[idx];
+    const float divisor = bVector[idx];
+    cVector[idx] = dividend / divisor;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Portable scalar kernel: divides aVector by bVector element-wise into cVector
+  \param cVector Output buffer; entry i receives aVector[i] / bVector[i]
+  \param aVector The vector to be divided (dividend)
+  \param bVector The divisor vector
+  \param num_points Number of elements read from each input and written to cVector
+*/
+static inline void volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  /* Plain indexed walk over the buffers, one quotient per iteration. */
+  for(idx = 0; idx < num_points; ++idx){
+    const float dividend = aVector[idx];
+    const float divisor = bVector[idx];
+    cVector[idx] = dividend / divisor;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Divides the two input vectors and stores the result in the third vector by forwarding to the external ORC implementation
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be divided
+ \param bVector The divisor vector
+ \param num_points The number of values in aVector and bVector to be divided and stored into cVector
+*/
+extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); // defined outside this header
+static inline void volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points); // NOTE(review): "_u_" wrapper forwards to the "_a_" impl -- presumably the ORC code has no alignment requirement; confirm
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
new file mode 100644
index 000000000..8fcc7deae
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
@@ -0,0 +1,98 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+  /* Scalar reference kernel: dot product of input and taps, delivered as int16_t. */
+  float accumulator = 0;
+  unsigned int idx;
+
+  /* Single-precision running sum of input[i] * taps[i], taken in index order. */
+  for(idx = 0; idx < num_points; ++idx){
+    accumulator += (input[idx] * taps[idx]);
+  }
+
+  /* The float sum is narrowed with a plain C cast (no rounding or clamping here). */
+  *result = (int16_t)accumulator;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+ // Dot product of input and taps over num_points floats, written to *result as int16_t. Aligned SSE loads: input and taps must be 16-byte aligned.
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+ // Four independent 4-lane accumulators, all starting at zero
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from each operand (aligned loads)
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+ // Lane-wise products
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+ // Fold each product vector into its own accumulator
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Merge the four accumulators into dotProdVal0
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ // Horizontal sum of the four lanes
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Scalar tail for the num_points % 16 elements not covered above
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+ // Cast to the declared output type (was "short"); matches the generic kernel's (int16_t) cast
+ *result = (int16_t)dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/
diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
new file mode 100644
index 000000000..b91252e36
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
@@ -0,0 +1,580 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+  /* Scalar reference kernel: dot product of input and taps, one float written to *result. */
+  float accumulator = 0;
+  unsigned int idx;
+
+  /* Single-precision running sum of input[i] * taps[i], taken in index order. */
+  for(idx = 0; idx < num_points; ++idx){
+    accumulator += (input[idx] * taps[idx]);
+  }
+
+  /* Zero points leaves the sum at 0, which is still written out. */
+  *result = accumulator;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+ // Dot product of input and taps over num_points floats, written to *result. Uses unaligned loads (_mm_loadu_ps) on input and taps.
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+ // Four independent 4-lane accumulators, all starting at zero
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from each operand (unaligned loads)
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
+ // Lane-wise products
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+ // Fold each product vector into its own accumulator
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Merge the four accumulators into dotProdVal0
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ // Horizontal sum of the four lanes
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Scalar tail for the num_points % 16 elements not covered above
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ // Dot product of input and taps over num_points floats, written to *result. Unaligned loads; the intrinsics used in this body are all SSE-level.
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+ // Four independent 4-lane accumulators, all starting at zero
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from each operand (unaligned loads)
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
+ // Lane-wise products
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+ // Fold each product vector into its own accumulator
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Merge the four accumulators into dotProdVal0
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ // Horizontal sum of the four lanes
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Scalar tail for the num_points % 16 elements not covered above
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ // Dot product of input and taps over num_points floats, written to *result. Unaligned loads plus the SSE4.1 _mm_dp_ps intrinsic.
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+ // Single 4-lane accumulator; each lane collects one of the four per-iteration partial dot products
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats of input, advancing the pointer after every 4
+ aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+ // Load the matching 16 taps
+ bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+ // Mask 0xF?: high nibble multiplies all four lanes; low nibble (1,2,4,8) puts the sum in lane 0,1,2,3 and zeroes the others
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+ // The four results occupy disjoint lanes (zeros elsewhere), so OR packs them into one vector
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+ // Horizontal sum of the four lanes
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Scalar tail for the num_points % 16 elements not covered above
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+ // Dot product of input and taps over num_points floats, written to *result. Uses unaligned 256-bit loads (_mm256_loadu_ps).
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
+ // Two independent 8-lane accumulators, both starting at zero
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from each operand (two unaligned 8-float loads apiece)
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr+8);
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr+8);
+ // Lane-wise products
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ // Fold each product vector into its own accumulator
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Merge the two accumulators into dotProdVal0
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ // Horizontal sum of the eight lanes
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+ // Scalar tail for the num_points % 16 elements not covered above
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+  /* Scalar reference kernel: dot product of input and taps, one float written to *result. */
+  float accumulator = 0;
+  unsigned int idx;
+
+  /* Single-precision running sum of input[i] * taps[i], taken in index order. */
+  for(idx = 0; idx < num_points; ++idx){
+    accumulator += (input[idx] * taps[idx]);
+  }
+
+  /* Zero points leaves the sum at 0, which is still written out. */
+  *result = accumulator;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+ // Dot product of input and taps over num_points floats, written to *result. Aligned loads (_mm_load_ps): input and taps must be 16-byte aligned.
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+ // Four independent 4-lane accumulators, all starting at zero
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from each operand (aligned loads)
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+ // Lane-wise products
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+ // Fold each product vector into its own accumulator
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Merge the four accumulators into dotProdVal0
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ // Horizontal sum of the four lanes
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Scalar tail for the num_points % 16 elements not covered above
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ // Dot product of input and taps over num_points floats, written to *result. Aligned loads (16-byte alignment required); the intrinsics used in this body are all SSE-level.
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+ // Four independent 4-lane accumulators, all starting at zero
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from each operand (aligned loads)
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+ // Lane-wise products
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+ // Fold each product vector into its own accumulator
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Merge the four accumulators into dotProdVal0
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ // Horizontal sum of the four lanes
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Scalar tail for the num_points % 16 elements not covered above
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ // Dot product of input and taps over num_points floats, written to *result. Aligned loads (16-byte alignment required) plus the SSE4.1 _mm_dp_ps intrinsic.
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+ // Single 4-lane accumulator; each lane collects one of the four per-iteration partial dot products
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats of input, advancing the pointer after every 4
+ aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+ // Load the matching 16 taps
+ bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+ // Mask 0xF?: high nibble multiplies all four lanes; low nibble (1,2,4,8) puts the sum in lane 0,1,2,3 and zeroes the others
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+ // The four results occupy disjoint lanes (zeros elsewhere), so OR packs them into one vector
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+ // Horizontal sum of the four lanes
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Scalar tail for the num_points % 16 elements not covered above
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+ // Dot product of input and taps over num_points floats, written to *result. Aligned 256-bit loads (_mm256_load_ps): input and taps must be 32-byte aligned.
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
+ // Two independent 8-lane accumulators, both starting at zero
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from each operand (two aligned 8-float loads apiece)
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr+8);
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr+8);
+ // Lane-wise products
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ // Fold each product vector into its own accumulator
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Merge the two accumulators into dotProdVal0
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ // Horizontal sum of the eight lanes
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+ // Scalar tail for the num_points % 16 elements not covered above
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/volk/kernels/volk/volk_32f_x2_interleave_32fc.h b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h
new file mode 100644
index 000000000..0935cb32b
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a_H
+#define INCLUDED_volk_32f_x2_interleave_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Interleaves the I & Q vector data into the complex vector
+ \param complexVector The complex output vector, written as consecutive (I, Q) float pairs
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+ unsigned int number = 0;
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ const unsigned int quarterPoints = num_points / 4; // same type as number/num_points (was uint64_t); the quotient always fits
+
+ __m128 iValue, qValue, cplxValue;
+ for(;number < quarterPoints; number++){
+ iValue = _mm_load_ps(iBufferPtr); // aligned load: iBuffer must be 16-byte aligned
+ qValue = _mm_load_ps(qBufferPtr); // aligned load: qBuffer must be 16-byte aligned
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue = _mm_unpacklo_ps(iValue, qValue);
+ _mm_store_ps(complexVectorPtr, cplxValue); // aligned store: complexVector must be 16-byte aligned
+ complexVectorPtr += 4;
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue = _mm_unpackhi_ps(iValue, qValue);
+ _mm_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 4;
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+ // Scalar tail for the num_points % 4 complex points not covered above
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Portable scalar kernel: merges separate I and Q float streams into one complex vector
+  \param complexVector The complex output vector, written as consecutive (I, Q) float pairs
+  \param iBuffer The I buffer data to be interleaved
+  \param qBuffer The Q buffer data to be interleaved
+  \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+  float* outPtr = (float*)complexVector;
+  unsigned int idx;
+
+  // Each complex point takes two floats of output: I first, then Q
+  for(idx = 0; idx < num_points; ++idx){
+    outPtr[0] = iBuffer[idx];
+    outPtr[1] = qBuffer[idx];
+    outPtr += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */
diff --git a/volk/kernels/volk/volk_32f_x2_max_32f.h b/volk/kernels/volk/volk_32f_x2_max_32f.h
new file mode 100644
index 000000000..27633acae
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_max_32f.h
@@ -0,0 +1,85 @@
+#ifndef INCLUDED_volk_32f_x2_max_32f_a_H
+#define INCLUDED_volk_32f_x2_max_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Stores, for each index, the larger of the aVector and bVector entries into cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+  \note Uses _mm_load_ps/_mm_store_ps, so all three buffers must be 16-byte aligned
+*/
+static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  const unsigned int vectorLoops = num_points / 4;
+  unsigned int block;
+  unsigned int idx;
+
+  // Main loop: four comparisons per iteration through aligned 128-bit accesses
+  for(block = 0; block < vectorLoops; block++){
+    const unsigned int offset = block * 4;
+    const __m128 lhs = _mm_load_ps(aVector + offset);
+    const __m128 rhs = _mm_load_ps(bVector + offset);
+
+    // Lane-wise maximum, operands passed as (a, b)
+    _mm_store_ps(cVector + offset, _mm_max_ps(lhs, rhs));
+  }
+
+  // Scalar tail: handles the num_points % 4 elements the vector loop left over
+  for(idx = vectorLoops * 4; idx < num_points; idx++){
+    const float lhs = aVector[idx];
+    const float rhs = bVector[idx];
+
+    // Keep a only when it is strictly greater; otherwise b is stored
+    if(lhs > rhs){
+      cVector[idx] = lhs;
+    }
+    else{
+      cVector[idx] = rhs;
+    }
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Portable scalar kernel: stores the larger of each aVector/bVector pair into cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_x2_max_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  // One comparison per element, in index order
+  for(idx = 0; idx < num_points; ++idx){
+    const float lhs = aVector[idx];
+    const float rhs = bVector[idx];
+    // Keep a only when it is strictly greater; otherwise b is stored
+    if(lhs > rhs){ cVector[idx] = lhs; }
+    else{ cVector[idx] = rhs; }
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Selects the maximum of each aVector/bVector entry pair and stores it in cVector by forwarding to the external ORC implementation
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); // defined outside this header
+static inline void volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points); // NOTE(review): "_u_" wrapper forwards to the "_a_" impl -- presumably the ORC code has no alignment requirement; confirm
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_max_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_x2_min_32f.h b/volk/kernels/volk/volk_32f_x2_min_32f.h
new file mode 100644
index 000000000..4773d1321
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_min_32f.h
@@ -0,0 +1,85 @@
+#ifndef INCLUDED_volk_32f_x2_min_32f_a_H
+#define INCLUDED_volk_32f_x2_min_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Stores, for each index, the smaller of the aVector and bVector entries into cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+  \note Uses _mm_load_ps/_mm_store_ps, so all three buffers must be 16-byte aligned
+*/
+static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  const unsigned int vectorLoops = num_points / 4;
+  unsigned int block;
+  unsigned int idx;
+
+  // Main loop: four comparisons per iteration through aligned 128-bit accesses
+  for(block = 0; block < vectorLoops; block++){
+    const unsigned int offset = block * 4;
+    const __m128 lhs = _mm_load_ps(aVector + offset);
+    const __m128 rhs = _mm_load_ps(bVector + offset);
+
+    // Lane-wise minimum, operands passed as (a, b)
+    _mm_store_ps(cVector + offset, _mm_min_ps(lhs, rhs));
+  }
+
+  // Scalar tail: handles the num_points % 4 elements the vector loop left over
+  for(idx = vectorLoops * 4; idx < num_points; idx++){
+    const float lhs = aVector[idx];
+    const float rhs = bVector[idx];
+
+    // Keep a only when it is strictly smaller; otherwise b is stored
+    if(lhs < rhs){
+      cVector[idx] = lhs;
+    }
+    else{
+      cVector[idx] = rhs;
+    }
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Portable scalar kernel: stores the smaller of each aVector/bVector pair into cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_x2_min_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  // One comparison per element, in index order
+  for(idx = 0; idx < num_points; ++idx){
+    const float lhs = aVector[idx];
+    const float rhs = bVector[idx];
+    // Keep a only when it is strictly smaller; otherwise b is stored
+    if(lhs < rhs){ cVector[idx] = lhs; }
+    else{ cVector[idx] = rhs; }
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Selects the minimum of each aVector/bVector entry pair and stores it in cVector by forwarding to the external ORC implementation
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); // defined outside this header
+static inline void volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points); // NOTE(review): "_u_" wrapper forwards to the "_a_" impl -- presumably the ORC code has no alignment requirement; confirm
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_min_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_x2_multiply_32f.h b/volk/kernels/volk/volk_32f_x2_multiply_32f.h
new file mode 100644
index 000000000..9fdbec0a2
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_multiply_32f.h
@@ -0,0 +1,226 @@
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
+#define INCLUDED_volk_32f_x2_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplies the two input vectors element by element and stores the products in the third vector (SSE, unaligned loads/stores)
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  const unsigned int vecLoops = num_points / 4;
+  unsigned int idx;
+
+  float* out = cVector;
+  const float* lhs = aVector;
+  const float* rhs = bVector;
+
+  // Four floats per iteration, using the unaligned load/store intrinsics
+  for(idx = 0; idx < vecLoops; idx++){
+    const __m128 product = _mm_mul_ps(_mm_loadu_ps(lhs), _mm_loadu_ps(rhs));
+
+    _mm_storeu_ps(out, product);
+
+    lhs += 4;
+    rhs += 4;
+    out += 4;
+  }
+
+  // Scalar pass over the remaining (num_points % 4) values
+  for(idx = vecLoops * 4; idx < num_points; idx++){
+    *out++ = (*lhs++) * (*rhs++);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Multiplies the two input vectors element by element and stores the products in the third vector (AVX, unaligned loads/stores)
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  const unsigned int vecLoops = num_points / 8;
+  unsigned int idx;
+
+  float* out = cVector;
+  const float* lhs = aVector;
+  const float* rhs = bVector;
+
+  // Eight floats per iteration, using the unaligned load/store intrinsics
+  for(idx = 0; idx < vecLoops; idx++){
+    const __m256 product = _mm256_mul_ps(_mm256_loadu_ps(lhs), _mm256_loadu_ps(rhs));
+
+    _mm256_storeu_ps(out, product);
+
+    lhs += 8;
+    rhs += 8;
+    out += 8;
+  }
+
+  // Scalar pass over the remaining (num_points % 8) values
+  for(idx = vecLoops * 8; idx < num_points; idx++){
+    *out++ = (*lhs++) * (*rhs++);
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies the two input vectors element by element and stores the products in the third vector (plain C)
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
+#define INCLUDED_volk_32f_x2_multiply_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplies the two input vectors element by element and stores the products in the third vector (SSE, aligned loads/stores)
+  \param cVector The vector where the results will be stored; written with _mm_store_ps
+  \param aVector One of the vectors to be multiplied; read with _mm_load_ps
+  \param bVector One of the vectors to be multiplied; read with _mm_load_ps
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  const unsigned int vecLoops = num_points / 4;
+  unsigned int idx;
+
+  float* out = cVector;
+  const float* lhs = aVector;
+  const float* rhs = bVector;
+
+  // Four floats per iteration, using the aligned load/store intrinsics
+  for(idx = 0; idx < vecLoops; idx++){
+    const __m128 product = _mm_mul_ps(_mm_load_ps(lhs), _mm_load_ps(rhs));
+
+    _mm_store_ps(out, product);
+
+    lhs += 4;
+    rhs += 4;
+    out += 4;
+  }
+
+  // Scalar pass over the remaining (num_points % 4) values
+  for(idx = vecLoops * 4; idx < num_points; idx++){
+    *out++ = (*lhs++) * (*rhs++);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Multiplies the two input vectors element by element and stores the products in the third vector (AVX, aligned loads/stores)
+  \param cVector The vector where the results will be stored; written with _mm256_store_ps
+  \param aVector One of the vectors to be multiplied; read with _mm256_load_ps
+  \param bVector One of the vectors to be multiplied; read with _mm256_load_ps
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  const unsigned int vecLoops = num_points / 8;
+  unsigned int idx;
+
+  float* out = cVector;
+  const float* lhs = aVector;
+  const float* rhs = bVector;
+
+  // Eight floats per iteration, using the aligned load/store intrinsics
+  for(idx = 0; idx < vecLoops; idx++){
+    const __m256 product = _mm256_mul_ps(_mm256_load_ps(lhs), _mm256_load_ps(rhs));
+
+    _mm256_store_ps(out, product);
+
+    lhs += 8;
+    rhs += 8;
+    out += 8;
+  }
+
+  // Scalar pass over the remaining (num_points % 8) values
+  for(idx = vecLoops * 8; idx < num_points; idx++){
+    *out++ = (*lhs++) * (*rhs++);
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies the two input vectors element by element and stores the products in the third vector (plain C)
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Multiplies the two input vectors and stores their results in the third vector
+
+ This wrapper only forwards its arguments, unchanged, to volk_32f_x2_multiply_32f_a_orc_impl,
+ which is declared extern below and defined outside this header.
+ NOTE(review): the wrapper is named "_u_" but forwards to the "_a_" implementation; confirm
+ that the ORC implementation has no alignment requirement.
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h b/volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
new file mode 100644
index 000000000..ce7b91a31
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
@@ -0,0 +1,156 @@
+#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
+#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+   \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+
+   Every output value, including the final (num_points % 4) points, goes through the same
+   conversion: _mm_cvtps_epi32 (rounds according to the current SSE rounding mode) followed
+   by _mm_packs_epi32 (signed saturation to the int16_t range). Previously the trailing
+   points used a truncating C cast, so results differed depending on position in the buffer.
+   \param complexVector The complex output vector; written with aligned 16-byte stores in the vector loop
+   \param iBuffer The I buffer data to be interleaved; read with aligned 16-byte loads in the vector loop
+   \param qBuffer The Q buffer data to be interleaved; read with aligned 16-byte loads in the vector loop
+   \param scalar The scaling value being multiplied against each data point
+   \param num_points The number of complex data values to be interleaved
+ */
+static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+  int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+  const __m128 vScalar = _mm_set_ps1(scalar);
+
+  __m128 iValue, qValue, cplxValue1, cplxValue2;
+  __m128i intValue1, intValue2;
+
+  // Scratch space for the converted values of one trailing point
+  __VOLK_ATTR_ALIGNED(16) int16_t tailBuffer[8];
+
+  for(; number < quarterPoints; number++){
+    iValue = _mm_load_ps(iBufferPtr);
+    qValue = _mm_load_ps(qBufferPtr);
+
+    // Interleaves the lower two values in the i and q variables into one buffer
+    cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+    cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+
+    // Interleaves the upper two values in the i and q variables into one buffer
+    cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+    cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+
+    intValue1 = _mm_cvtps_epi32(cplxValue1);
+    intValue2 = _mm_cvtps_epi32(cplxValue2);
+
+    // Narrow the eight 32 bit values to 16 bits with signed saturation
+    intValue1 = _mm_packs_epi32(intValue1, intValue2);
+
+    _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
+    complexVectorPtr += 8;
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  complexVectorPtr = (int16_t*)(&complexVector[number]);
+  for(; number < num_points; number++){
+    // Build (i, q, 0, 0) and run it through the same scale/convert/pack
+    // sequence as the vector loop so rounding and saturation match.
+    cplxValue1 = _mm_unpacklo_ps(_mm_load_ss(iBufferPtr), _mm_load_ss(qBufferPtr));
+    cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+
+    intValue1 = _mm_cvtps_epi32(cplxValue1);
+    intValue1 = _mm_packs_epi32(intValue1, intValue1);
+
+    _mm_store_si128((__m128i*)tailBuffer, intValue1);
+
+    *complexVectorPtr++ = tailBuffer[0];
+    *complexVectorPtr++ = tailBuffer[1];
+
+    iBufferPtr++;
+    qBufferPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+   \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+
+   Scaling is done with SSE; each scaled float is then converted with a C cast to int16_t.
+   \param complexVector The complex output vector
+   \param iBuffer The I buffer data to be interleaved; read with aligned 16-byte loads in the vector loop
+   \param qBuffer The Q buffer data to be interleaved; read with aligned 16-byte loads in the vector loop
+   \param scalar The scaling value being multiplied against each data point
+   \param num_points The number of complex data values to be interleaved
+ */
+static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+  const unsigned int vecLoops = num_points / 4;
+  const __m128 scale = _mm_set_ps1(scalar);
+
+  const float* iIn = iBuffer;
+  const float* qIn = qBuffer;
+  int16_t* out = (int16_t*)complexVector;
+
+  __VOLK_ATTR_ALIGNED(16) float scaled[4];
+  unsigned int n, lane;
+
+  for(n = 0; n < vecLoops; n++){
+    const __m128 iVals = _mm_load_ps(iIn);
+    const __m128 qVals = _mm_load_ps(qIn);
+
+    // Lower two I/Q pairs: interleave, scale, then convert lane by lane
+    _mm_store_ps(scaled, _mm_mul_ps(_mm_unpacklo_ps(iVals, qVals), scale));
+    for(lane = 0; lane < 4; lane++){
+      *out++ = (int16_t)(scaled[lane]);
+    }
+
+    // Upper two I/Q pairs
+    _mm_store_ps(scaled, _mm_mul_ps(_mm_unpackhi_ps(iVals, qVals), scale));
+    for(lane = 0; lane < 4; lane++){
+      *out++ = (int16_t)(scaled[lane]);
+    }
+
+    iIn += 4;
+    qIn += 4;
+  }
+
+  // Scalar pass over the remaining (num_points % 4) points
+  n = vecLoops * 4;
+  out = (int16_t*)(&complexVector[n]);
+  for(; n < num_points; n++){
+    *out++ = (int16_t)(*iIn++ * scalar);
+    *out++ = (int16_t)(*qIn++ * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+
+   Each product is converted with a C cast to int16_t.
+   \param complexVector The complex output vector
+   \param iBuffer The I buffer data to be interleaved
+   \param qBuffer The Q buffer data to be interleaved
+   \param scalar The scaling value being multiplied against each data point
+   \param num_points The number of complex data values to be interleaved
+ */
+static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+  int16_t* out = (int16_t*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    // Output layout is I, Q, I, Q, ...
+    out[0] = (int16_t)(iBuffer[idx] * scalar);
+    out[1] = (int16_t)(qBuffer[idx] * scalar);
+    out += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */
diff --git a/volk/kernels/volk/volk_32f_x2_subtract_32f.h b/volk/kernels/volk/volk_32f_x2_subtract_32f.h
new file mode 100644
index 000000000..8ea491f98
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_subtract_32f.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H
+#define INCLUDED_volk_32f_x2_subtract_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Subtracts bVector from aVector element by element and stores the differences in cVector (SSE, aligned loads/stores)
+  \param cVector The vector where the results will be stored; written with _mm_store_ps
+  \param aVector The initial vector; read with _mm_load_ps
+  \param bVector The vector to be subtracted; read with _mm_load_ps
+  \param num_points The number of values in aVector and bVector to be subtracted and stored into cVector
+*/
+static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  const unsigned int vecLoops = num_points / 4;
+  unsigned int idx;
+
+  float* out = cVector;
+  const float* minuend = aVector;
+  const float* subtrahend = bVector;
+
+  // Four floats per iteration, using the aligned load/store intrinsics
+  for(idx = 0; idx < vecLoops; idx++){
+    const __m128 difference = _mm_sub_ps(_mm_load_ps(minuend), _mm_load_ps(subtrahend));
+
+    _mm_store_ps(out, difference);
+
+    minuend += 4;
+    subtrahend += 4;
+    out += 4;
+  }
+
+  // Scalar pass over the remaining (num_points % 4) values
+  for(idx = vecLoops * 4; idx < num_points; idx++){
+    *out++ = (*minuend++) - (*subtrahend++);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Subtracts bVector from aVector element by element and stores the differences in cVector (plain C)
+  \param cVector The vector where the results will be stored
+  \param aVector The initial vector
+  \param bVector The vector to be subtracted
+  \param num_points The number of values in aVector and bVector to be subtracted and stored into cVector
+*/
+static inline void volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = aVector[idx] - bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Subtracts bVector from aVector and stores the results in cVector
+
+ This wrapper only forwards its arguments, unchanged, to volk_32f_x2_subtract_32f_a_orc_impl,
+ which is declared extern below and defined outside this header.
+ NOTE(review): the wrapper is named "_u_" but forwards to the "_a_" implementation; confirm
+ that the ORC implementation has no alignment requirement.
+ \param cVector The vector where the results will be stored
+ \param aVector The initial vector
+ \param bVector The vector to be subtracted
+ \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
new file mode 100644
index 000000000..e975f14e9
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
@@ -0,0 +1,152 @@
+#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
+#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#ifndef MAX
+#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#endif
+
+#ifdef LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+/*!
+  \brief Sums a clamped fourth-order polynomial over src0 and writes the scalar total to target[0] (SSE3, aligned loads)
+
+  With c = center_point_array and x = max(src0[i], *cutoff), the value written is
+  sum over i of (c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4), plus num_points * c[4].
+  \param target Output; only target[0] is written
+  \param src0 Input samples; read with _mm_load_ps in the vector loop
+  \param center_point_array Coefficients; elements 0 through 4 are read
+  \param cutoff Pointer to the lower bound applied to every sample before evaluation
+  \param num_points The number of samples in src0
+*/
+static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {
+
+  // Loop counts come straight from num_points. The previous num_points*4 byte
+  // count wrapped around for num_points > UINT_MAX/4 and was then stored in
+  // signed ints, silently shrinking the number of samples processed.
+  const unsigned int bound = num_points / 4;
+  const unsigned int leftovers = num_points % 4;
+  unsigned int i;
+
+  float result = 0.0;
+  float fst = 0.0;
+  float sq = 0.0;
+  float thrd = 0.0;
+  float frth = 0.0;
+
+  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
+
+  // Two running sums: xmm9 for the x and x^2 terms, xmm1 for the x^3 and x^4 terms
+  xmm9 = _mm_setzero_ps();
+  xmm1 = _mm_setzero_ps();
+
+  // Broadcast each coefficient and the cutoff to all four lanes
+  xmm0 = _mm_load1_ps(&center_point_array[0]);
+  xmm6 = _mm_load1_ps(&center_point_array[1]);
+  xmm7 = _mm_load1_ps(&center_point_array[2]);
+  xmm8 = _mm_load1_ps(&center_point_array[3]);
+  xmm10 = _mm_load1_ps(cutoff);
+
+  for(i = 0; i < bound; ++i) {
+    xmm2 = _mm_load_ps(src0);
+    xmm2 = _mm_max_ps(xmm10, xmm2);   // clamp to the cutoff
+    xmm3 = _mm_mul_ps(xmm2, xmm2);    // x^2
+    xmm4 = _mm_mul_ps(xmm2, xmm3);    // x^3
+    xmm5 = _mm_mul_ps(xmm3, xmm3);    // x^4
+
+    xmm2 = _mm_mul_ps(xmm2, xmm0);
+    xmm3 = _mm_mul_ps(xmm3, xmm6);
+    xmm4 = _mm_mul_ps(xmm4, xmm7);
+    xmm5 = _mm_mul_ps(xmm5, xmm8);
+
+    xmm2 = _mm_add_ps(xmm2, xmm3);
+    xmm3 = _mm_add_ps(xmm4, xmm5);
+
+    src0 += 4;
+
+    xmm9 = _mm_add_ps(xmm2, xmm9);
+    xmm1 = _mm_add_ps(xmm3, xmm1);
+  }
+
+  // Horizontal reduction of both accumulators into a single float
+  xmm2 = _mm_hadd_ps(xmm9, xmm1);
+  xmm3 = _mm_hadd_ps(xmm2, xmm2);
+  xmm4 = _mm_hadd_ps(xmm3, xmm3);
+
+  _mm_store_ss(&result, xmm4);
+
+  // src0 already points past the vectorised samples
+  for(i = 0; i < leftovers; ++i) {
+    fst = src0[i];
+    fst = (fst > *cutoff) ? fst : *cutoff;
+    sq = fst * fst;
+    thrd = fst * sq;
+    frth = sq * sq;
+
+    result += (center_point_array[0] * fst +
+               center_point_array[1] * sq +
+               center_point_array[2] * thrd +
+               center_point_array[3] * frth);
+  }
+
+  // The constant term is added once for every sample
+  result += ((float)num_points) * center_point_array[4];
+
+  target[0] = result;
+}
+
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+
+/*!
+  \brief Sums a clamped fourth-order polynomial over src0 and writes the scalar total to *target (plain C)
+
+  With c = center_point_array and x = max(src0[i], *cutoff), the value written is
+  sum over i of (c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4), plus num_points * c[4].
+  \param target Output; a single float is written
+  \param src0 Input samples
+  \param center_point_array Coefficients; elements 0 through 4 are read
+  \param cutoff Pointer to the lower bound applied to every sample before evaluation
+  \param num_points The number of samples in src0
+*/
+static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {
+
+  float result = 0.0;
+  float fst = 0.0;
+  float sq = 0.0;
+  float thrd = 0.0;
+  float frth = 0.0;
+
+  unsigned int i;
+
+  // Iterate over num_points directly. The previous (num_points*4) >> 2 bound
+  // wrapped around for num_points > UINT_MAX/4 and dropped samples.
+  for(i = 0; i < num_points; ++i) {
+    fst = src0[i];
+    fst = (fst > *cutoff) ? fst : *cutoff;
+
+    sq = fst * fst;
+    thrd = fst * sq;
+    frth = sq * sq;
+
+    result += (center_point_array[0] * fst +
+               center_point_array[1] * sq +
+               center_point_array[2] * thrd +
+               center_point_array[3] * frth);
+  }
+
+  // The constant term is added once for every sample
+  result += ((float)num_points) * center_point_array[4];
+
+  *target = result;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
diff --git a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
new file mode 100644
index 000000000..e0a8a59ce
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
@@ -0,0 +1,111 @@
+#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Dot product of a complex vector with a real tap vector (plain C)
+
+  The real and imaginary parts of input[k] are each multiplied by taps[k] and
+  accumulated separately; the two sums are written to *result as one complex value.
+  \param result Output; a single complex value is written
+  \param input The complex input vector, read as interleaved (real, imag) floats
+  \param taps The real-valued taps
+  \param num_points The number of complex values in input (and floats in taps) to process
+*/
+static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
+
+  float res[2];
+  const float* cplx = (const float*)input;
+  unsigned int idx;
+
+  res[0] = 0;
+  res[1] = 0;
+
+  for(idx = 0; idx < num_points; idx++){
+    const float tap = taps[idx];
+
+    res[0] += (cplx[0] * tap);  // real part
+    res[1] += (cplx[1] * tap);  // imaginary part
+    cplx += 2;
+  }
+
+  // Reinterpret the (real, imag) float pair as one complex value
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+/*!
+  \brief Dot product of a complex vector with a real tap vector (SSE, aligned loads)
+
+  Eight complex values are processed per iteration of the vector loop; the
+  remaining (num_points % 8) values are handled by a scalar loop.
+  \param result Output; a single complex value is written
+  \param input The complex input vector, read as interleaved (real, imag) floats with _mm_load_ps
+  \param taps The real-valued taps; read with _mm_load_ps
+  \param num_points The number of complex values in input (and floats in taps) to process
+*/
+static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+
+  const unsigned int eighthPoints = num_points / 8;
+  unsigned int number;
+
+  float res[2];
+  const float* cplxPtr = (const float*)input;
+  const float* tapPtr = taps;
+
+  __m128 tapsLo, tapsHi;
+  __m128 prod0, prod1, prod2, prod3;
+
+  __m128 acc0 = _mm_setzero_ps();
+  __m128 acc1 = _mm_setzero_ps();
+  __m128 acc2 = _mm_setzero_ps();
+  __m128 acc3 = _mm_setzero_ps();
+
+  __VOLK_ATTR_ALIGNED(16) float lanes[4];
+
+  for(number = 0; number < eighthPoints; number++){
+
+    tapsLo = _mm_load_ps(tapPtr);
+    tapsHi = _mm_load_ps(tapPtr+4);
+
+    // Unpacking a register with itself duplicates each tap (t0,t0,t1,t1 ...)
+    // so it lines up with the interleaved (real, imag) input floats.
+    prod0 = _mm_mul_ps(_mm_load_ps(cplxPtr),    _mm_unpacklo_ps(tapsLo, tapsLo));
+    prod1 = _mm_mul_ps(_mm_load_ps(cplxPtr+4),  _mm_unpackhi_ps(tapsLo, tapsLo));
+    prod2 = _mm_mul_ps(_mm_load_ps(cplxPtr+8),  _mm_unpacklo_ps(tapsHi, tapsHi));
+    prod3 = _mm_mul_ps(_mm_load_ps(cplxPtr+12), _mm_unpackhi_ps(tapsHi, tapsHi));
+
+    acc0 = _mm_add_ps(prod0, acc0);
+    acc1 = _mm_add_ps(prod1, acc1);
+    acc2 = _mm_add_ps(prod2, acc2);
+    acc3 = _mm_add_ps(prod3, acc3);
+
+    cplxPtr += 16;
+    tapPtr += 8;
+  }
+
+  // Fold the four accumulators into one
+  acc0 = _mm_add_ps(acc0, acc1);
+  acc0 = _mm_add_ps(acc0, acc2);
+  acc0 = _mm_add_ps(acc0, acc3);
+
+  _mm_store_ps(lanes, acc0);
+
+  // Lanes 0 and 2 hold real sums, lanes 1 and 3 hold imaginary sums
+  res[0] = lanes[0];
+  res[1] = lanes[1];
+  res[0] += lanes[2];
+  res[1] += lanes[3];
+
+  // Scalar pass over the remaining (num_points % 8) values
+  for(number = eighthPoints*8; number < num_points; number++){
+    res[0] += (cplxPtr[0] * (*tapPtr));
+    res[1] += (cplxPtr[1] * (*tapPtr));
+    cplxPtr += 2;
+    tapPtr++;
+  }
+
+  // Reinterpret the (real, imag) float pair as one complex value
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/
diff --git a/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h
new file mode 100644
index 000000000..104e3250e
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h
@@ -0,0 +1,95 @@
+#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+   \brief Multiplies each complex value in aVector by the matching float in bVector and stores the results in cVector (SSE, aligned loads/stores)
+   \param cVector The vector where the results will be stored; written with _mm_store_ps
+   \param aVector The complex vector to be multiplied; read with _mm_load_ps
+   \param bVector The vector containing the float values to be multiplied against each complex value in aVector; read with _mm_load_ps
+   \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+  const unsigned int vecLoops = num_points / 4;
+  unsigned int n;
+
+  const float* cplxIn = (const float*)aVector;
+  float* cplxOut = (float*)cVector;
+  const float* factorIn = bVector;
+
+  // Four complex values (eight floats) and four factors per iteration
+  for(n = 0; n < vecLoops; n++){
+    const __m128 firstPair = _mm_load_ps(cplxIn);
+    const __m128 secondPair = _mm_load_ps(cplxIn + 4);
+    const __m128 factors = _mm_load_ps(factorIn);
+
+    // Duplicate each factor so it scales both the real and the imaginary float
+    const __m128 factorsLo = _mm_shuffle_ps(factors, factors, _MM_SHUFFLE(1,1,0,0));
+    const __m128 factorsHi = _mm_shuffle_ps(factors, factors, _MM_SHUFFLE(3,3,2,2));
+
+    _mm_store_ps(cplxOut, _mm_mul_ps(firstPair, factorsLo));
+    _mm_store_ps(cplxOut + 4, _mm_mul_ps(secondPair, factorsHi));
+
+    cplxIn += 8;
+    cplxOut += 8;
+    factorIn += 4;
+  }
+
+  // Scalar pass over the remaining (num_points % 4) values
+  for(n = vecLoops * 4; n < num_points; n++){
+    cVector[n] = aVector[n] * bVector[n];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Multiplies each complex value in aVector by the matching float in bVector and stores the results in cVector (plain C)
+   \param cVector The vector where the results will be stored
+   \param aVector The complex vector to be multiplied
+   \param bVector The vector containing the float values to be multiplied against each complex value in aVector
+   \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+ \brief Multiplies the input complex vector with the input float vector and stores the results in the third vector
+
+ This wrapper only forwards its arguments, unchanged, to volk_32fc_32f_multiply_32fc_a_orc_impl,
+ which is declared extern below and defined outside this header.
+ NOTE(review): the wrapper is named "_u_" but forwards to the "_a_" implementation; confirm
+ that the ORC implementation has no alignment requirement.
+ \param cVector The vector where the results will be stored
+ \param aVector The complex vector to be multiplied
+ \param bVector The vector containing the float values to be multiplied against each complex value in aVector
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+ */
+extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+ volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */
diff --git a/volk/kernels/volk/volk_32fc_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_conjugate_32fc.h
new file mode 100644
index 000000000..dce897ff5
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_conjugate_32fc.h
@@ -0,0 +1,128 @@
+#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+   \brief Takes the conjugate of a complex vector (unaligned loads/stores).
+   \param cVector The vector where the results will be stored
+   \param aVector Vector to be conjugated
+   \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+  const unsigned int pairCount = num_points / 2;
+  unsigned int n;
+
+  const lv_32fc_t* src = aVector;
+  lv_32fc_t* dst = cVector;
+
+  // Sign bit set in lanes 1 and 3 only, i.e. the imaginary floats of (ar,ai,br,bi)
+  const __m128 signMask = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+  // Two complex values per iteration; the XOR flips the sign of each imaginary part
+  for(n = 0; n < pairCount; n++){
+    const __m128 flipped = _mm_xor_ps(_mm_loadu_ps((const float*)src), signMask);
+
+    _mm_storeu_ps((float*)dst, flipped);
+
+    src += 2;
+    dst += 2;
+  }
+
+  // An odd num_points leaves one value for the scalar path
+  if(num_points & 1) {
+    *dst = lv_conj(*src);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Takes the conjugate of a complex vector (plain C).
+   \param cVector The vector where the results will be stored
+   \param aVector Vector to be conjugated
+   \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = lv_conj(aVector[idx]);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+   \brief Takes the conjugate of a complex vector (aligned loads/stores).
+   \param cVector The vector where the results will be stored; written with _mm_store_ps
+   \param aVector Vector to be conjugated; read with _mm_load_ps
+   \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+  const unsigned int pairCount = num_points / 2;
+  unsigned int n;
+
+  const lv_32fc_t* src = aVector;
+  lv_32fc_t* dst = cVector;
+
+  // Sign bit set in lanes 1 and 3 only, i.e. the imaginary floats of (ar,ai,br,bi)
+  const __m128 signMask = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+  // Two complex values per iteration; the XOR flips the sign of each imaginary part
+  for(n = 0; n < pairCount; n++){
+    const __m128 flipped = _mm_xor_ps(_mm_load_ps((const float*)src), signMask);
+
+    _mm_store_ps((float*)dst, flipped);
+
+    src += 2;
+    dst += 2;
+  }
+
+  // An odd num_points leaves one value for the scalar path
+  if(num_points & 1) {
+    *dst = lv_conj(*src);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Takes the conjugate of a complex vector (plain C).
+   \param cVector The vector where the results will be stored
+   \param aVector Vector to be conjugated
+   \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    cVector[idx] = lv_conj(aVector[idx]);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
new file mode 100644
index 000000000..0d33ed7e2
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
+#define INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Splits an interleaved complex vector into separate I and Q float buffers
+  \param iBuffer Output buffer receiving the real (I) components
+  \param qBuffer Output buffer receiving the imaginary (Q) components
+  \param complexVector The complex input vector
+  \param num_points The number of complex values to deinterleave
+
+  Uses aligned loads/stores (_mm_load_ps / _mm_store_ps) on all three buffers.
+*/
+static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  float* iOut = iBuffer;
+  float* qOut = qBuffer;
+  const unsigned int blocks = num_points / 4;
+  unsigned int blk;
+  unsigned int tail;
+  __m128 lo, hi;
+
+  // Four complex values (eight floats) per iteration
+  for(blk = 0; blk < blocks; blk++){
+    lo = _mm_load_ps(in);
+    hi = _mm_load_ps(in + 4);
+    in += 8;
+
+    // Even lanes of both registers -> i1 i2 i3 i4
+    _mm_store_ps(iOut, _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0)));
+    // Odd lanes of both registers -> q1 q2 q3 q4
+    _mm_store_ps(qOut, _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1)));
+
+    iOut += 4;
+    qOut += 4;
+  }
+
+  // Scalar path for the up to three values that do not fill a block
+  for(tail = num_points % 4; tail > 0; tail--){
+    *iOut++ = in[0];
+    *qOut++ = in[1];
+    in += 2;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Splits an interleaved complex vector into separate I and Q float buffers
+  \param iBuffer Output buffer receiving the real (I) components
+  \param qBuffer Output buffer receiving the imaginary (Q) components
+  \param complexVector The complex input vector
+  \param num_points The number of complex values to deinterleave
+*/
+static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  // View the complex input as a flat run of floats: real, imag, real, imag, ...
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    iBuffer[idx] = in[0];
+    qBuffer[idx] = in[1];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a_H */
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
new file mode 100644
index 000000000..4a4c5509b
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
@@ -0,0 +1,156 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_u_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Splits an interleaved lv_32fc_t vector into separate double-precision I and Q buffers
+  \param iBuffer Output buffer receiving the real (I) components as doubles
+  \param qBuffer Output buffer receiving the imaginary (Q) components as doubles
+  \param complexVector The complex input vector
+  \param num_points The number of complex values to deinterleave
+
+  Uses unaligned loads/stores (_mm_loadu_ps / _mm_storeu_pd).
+*/
+static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  double* iOut = iBuffer;
+  double* qOut = qBuffer;
+  const unsigned int pairCount = num_points / 2;
+  unsigned int pair;
+  __m128 packed;
+
+  // Two complex values per iteration; _mm_cvtps_pd widens the two low lanes
+  for(pair = 0; pair < pairCount; pair++){
+    packed = _mm_loadu_ps(in);
+    in += 4;
+
+    // Low lanes become i1 i2
+    _mm_storeu_pd(iOut, _mm_cvtps_pd(_mm_shuffle_ps(packed, packed, _MM_SHUFFLE(2,0,2,0))));
+    // Low lanes become q1 q2
+    _mm_storeu_pd(qOut, _mm_cvtps_pd(_mm_shuffle_ps(packed, packed, _MM_SHUFFLE(3,1,3,1))));
+
+    iOut += 2;
+    qOut += 2;
+  }
+
+  // An odd count leaves exactly one value for the scalar path
+  if(num_points & 1){
+    *iOut = in[0];
+    *qOut = in[1];
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Splits an interleaved lv_32fc_t vector into separate double-precision I and Q buffers
+  \param iBuffer Output buffer receiving the real (I) components as doubles
+  \param qBuffer Output buffer receiving the imaginary (Q) components as doubles
+  \param complexVector The complex input vector
+  \param num_points The number of complex values to deinterleave
+*/
+static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  // View the complex input as a flat run of floats: real, imag, real, imag, ...
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    iBuffer[idx] = (double)in[0];
+    qBuffer[idx] = (double)in[1];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_u_H */
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Splits an interleaved lv_32fc_t vector into separate double-precision I and Q buffers
+  \param iBuffer Output buffer receiving the real (I) components as doubles
+  \param qBuffer Output buffer receiving the imaginary (Q) components as doubles
+  \param complexVector The complex input vector
+  \param num_points The number of complex values to deinterleave
+
+  Uses aligned loads/stores (_mm_load_ps / _mm_store_pd) on all three buffers.
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  double* iOut = iBuffer;
+  double* qOut = qBuffer;
+  const unsigned int pairCount = num_points / 2;
+  unsigned int pair;
+  __m128 packed;
+
+  // Two complex values per iteration; _mm_cvtps_pd widens the two low lanes
+  for(pair = 0; pair < pairCount; pair++){
+    packed = _mm_load_ps(in);
+    in += 4;
+
+    // Low lanes become i1 i2
+    _mm_store_pd(iOut, _mm_cvtps_pd(_mm_shuffle_ps(packed, packed, _MM_SHUFFLE(2,0,2,0))));
+    // Low lanes become q1 q2
+    _mm_store_pd(qOut, _mm_cvtps_pd(_mm_shuffle_ps(packed, packed, _MM_SHUFFLE(3,1,3,1))));
+
+    iOut += 2;
+    qOut += 2;
+  }
+
+  // An odd count leaves exactly one value for the scalar path
+  if(num_points & 1){
+    *iOut = in[0];
+    *qOut = in[1];
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Splits an interleaved lv_32fc_t vector into separate double-precision I and Q buffers
+  \param iBuffer Output buffer receiving the real (I) components as doubles
+  \param qBuffer Output buffer receiving the imaginary (Q) components as doubles
+  \param complexVector The complex input vector
+  \param num_points The number of complex values to deinterleave
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  // View the complex input as a flat run of floats: real, imag, real, imag, ...
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    iBuffer[idx] = (double)in[0];
+    qBuffer[idx] = (double)in[1];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
new file mode 100644
index 000000000..b1968296f
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
@@ -0,0 +1,68 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_imag_32f_a_H
+#define INCLUDED_volk_32fc_deinterleave_imag_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Extracts the imaginary (Q) component of every value in a complex vector
+  \param qBuffer Output buffer receiving the Q components
+  \param complexVector The complex input vector
+  \param num_points The number of complex values to deinterleave
+
+  Uses aligned loads/stores (_mm_load_ps / _mm_store_ps) on both buffers.
+*/
+static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (const float*)complexVector;
+  float* qOut = qBuffer;
+  const unsigned int blocks = num_points / 4;
+  unsigned int blk;
+  unsigned int tail;
+  __m128 lo, hi;
+
+  // Four complex values (eight floats) per iteration
+  for(blk = 0; blk < blocks; blk++, in += 8, qOut += 4){
+    lo = _mm_load_ps(in);
+    hi = _mm_load_ps(in + 4);
+
+    // Odd lanes of both registers -> q1 q2 q3 q4
+    _mm_store_ps(qOut, _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1)));
+  }
+
+  // Scalar path for the remainder: skip the real part, keep the imaginary part
+  for(tail = num_points % 4; tail > 0; tail--, in += 2){
+    *qOut++ = in[1];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Extracts the imaginary (Q) component of every value in a complex vector
+  \param qBuffer Output buffer receiving the Q components
+  \param complexVector The complex input vector
+  \param num_points The number of complex values to deinterleave
+*/
+static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  // View the complex input as a flat run of floats: real, imag, real, imag, ...
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    qBuffer[idx] = in[1]; // second float of each pair is the imaginary part
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_a_H */
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h
new file mode 100644
index 000000000..3d5759813
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h
@@ -0,0 +1,68 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a_H
+#define INCLUDED_volk_32fc_deinterleave_real_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Extracts the real (I) component of every value in a complex vector
+  \param iBuffer Output buffer receiving the I components
+  \param complexVector The complex input vector
+  \param num_points The number of complex values to deinterleave
+
+  Uses aligned loads/stores (_mm_load_ps / _mm_store_ps) on both buffers.
+*/
+static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (const float*)complexVector;
+  float* iOut = iBuffer;
+  const unsigned int blocks = num_points / 4;
+  unsigned int blk;
+  unsigned int tail;
+  __m128 lo, hi;
+
+  // Four complex values (eight floats) per iteration
+  for(blk = 0; blk < blocks; blk++, in += 8, iOut += 4){
+    lo = _mm_load_ps(in);
+    hi = _mm_load_ps(in + 4);
+
+    // Even lanes of both registers -> i1 i2 i3 i4
+    _mm_store_ps(iOut, _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0)));
+  }
+
+  // Scalar path for the remainder: keep the real part, skip the imaginary part
+  for(tail = num_points % 4; tail > 0; tail--, in += 2){
+    *iOut++ = in[0];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Extracts the real (I) component of every value in a complex vector
+  \param iBuffer Output buffer receiving the I components
+  \param complexVector The complex input vector
+  \param num_points The number of complex values to deinterleave
+*/
+static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  // View the complex input as a flat run of floats: real, imag, real, imag, ...
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    iBuffer[idx] = in[0]; // first float of each pair is the real part
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a_H */
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_real_64f.h b/volk/kernels/volk/volk_32fc_deinterleave_real_64f.h
new file mode 100644
index 000000000..1fa66e8ad
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_deinterleave_real_64f.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a_H
+#define INCLUDED_volk_32fc_deinterleave_real_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Extracts the real (I) component of every complex value as a double
+  \param iBuffer Output buffer receiving the I components as doubles
+  \param complexVector The complex input vector
+  \param num_points The number of complex values to deinterleave
+
+  Uses aligned loads/stores (_mm_load_ps / _mm_store_pd) on both buffers.
+*/
+static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  double* iOut = iBuffer;
+  const unsigned int pairCount = num_points / 2;
+  unsigned int pair;
+  __m128 packed;
+
+  // Two complex values per iteration; _mm_cvtps_pd widens the two low lanes
+  for(pair = 0; pair < pairCount; pair++){
+    packed = _mm_load_ps(in);
+    in += 4;
+
+    // Low lanes become i1 i2
+    _mm_store_pd(iOut, _mm_cvtps_pd(_mm_shuffle_ps(packed, packed, _MM_SHUFFLE(2,0,2,0))));
+
+    iOut += 2;
+  }
+
+  // An odd count leaves exactly one value for the scalar path
+  if(num_points & 1){
+    *iOut = (double)in[0];
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Extracts the real (I) component of every complex value as a double
+  \param iBuffer Output buffer receiving the I components as doubles
+  \param complexVector The complex input vector
+  \param num_points The number of complex values to deinterleave
+*/
+static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  // View the complex input as a flat run of floats: real, imag, real, imag, ...
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    iBuffer[idx] = (double)in[0]; // first float of each pair is the real part
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a_H */
diff --git a/volk/kernels/volk/volk_32fc_index_max_16u.h b/volk/kernels/volk/volk_32fc_index_max_16u.h
new file mode 100644
index 000000000..c8d721240
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_index_max_16u.h
@@ -0,0 +1,218 @@
+#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
+#define INCLUDED_volk_32fc_index_max_16u_a_H
+
+#include <volk/volk_common.h>
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+
+/*!
+  \brief Finds the index of the complex value in src0 with the largest squared magnitude
+  \param target Output; target[0] receives the index of the maximum
+  \param src0 The complex input vector, read with aligned loads (_mm_load_ps)
+  \param num_points The number of complex values in src0
+
+  Four lanes each track a running maximum of re^2 + im^2 together with the
+  index that produced it; the lanes are reduced to a single index at the end.
+*/
+static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) {
+
+  const unsigned int num_bytes = num_points*8;
+
+  union bit128 holderf;
+  union bit128 holderi;
+  float sq_dist = 0.0;
+
+  union bit128 xmm5, xmm4;
+  __m128 xmm1, xmm2, xmm3;
+  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+
+  xmm5.int_vec = xmmfive = _mm_setzero_si128();
+  xmm4.int_vec = xmmfour = _mm_setzero_si128();
+  holderf.int_vec = holder0 = _mm_setzero_si128();
+  holderi.int_vec = holder1 = _mm_setzero_si128();
+
+  // Work split: blocks of four points, then at most one pair, then at most one single point
+  int bound = num_bytes >> 5;
+  int leftovers0 = (num_bytes >> 4) & 1;
+  int leftovers1 = (num_bytes >> 3) & 1;
+  int i = 0;
+
+  // xmm8: index of the point currently held in each lane (lane 0 = 0 ... lane 3 = 3)
+  xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
+  // xmm9: per-lane index of the running maximum. It is zeroed on its own:
+  // chaining the assignment through xmm8 would wipe the lane indices set above.
+  xmm9 = _mm_setzero_si128();
+  xmm10 = _mm_set_epi32(4, 4, 4, 4); // index stride of one full block
+  xmm3 = _mm_setzero_ps(); // per-lane running maximum of the squared magnitude
+
+  for(; i < bound; ++i) {
+
+    xmm1 = _mm_load_ps((float*)src0);
+    xmm2 = _mm_load_ps((float*)&src0[2]);
+
+    src0 += 4;
+
+    xmm1 = _mm_mul_ps(xmm1, xmm1);
+    xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+    // re^2 + im^2 for the four points of this block
+    xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+    xmm3 = _mm_max_ps(xmm1, xmm3);
+
+    // xmm5 marks lanes where the new value is the maximum, xmm4 lanes where the old one stands
+    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+    // Select the current index or keep the stored one, lane by lane
+    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+    xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+    xmm8 = _mm_add_epi32(xmm8, xmm10);
+  }
+
+  for(i = 0; i < leftovers0; ++i) {
+
+    xmm2 = _mm_load_ps((float*)src0);
+
+    // Duplicate the two low lane indices into the high lanes to match the
+    // duplicated sums that _mm_hadd_ps(xmm2, xmm2) produces below
+    xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+    xmm8 = bit128_p(&xmm1)->int_vec;
+
+    xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+    src0 += 2;
+
+    xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+    xmm3 = _mm_max_ps(xmm1, xmm3);
+
+    xmm10 = _mm_set_epi32(2, 2, 2, 2); // index stride of one pair
+
+    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+    xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+    xmm8 = _mm_add_epi32(xmm8, xmm10);
+  }
+
+  for(i = 0; i < leftovers1; ++i) {
+
+    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+
+    xmm2 = _mm_load1_ps(&sq_dist);
+
+    // Keep the previous maxima so the comparison below can tell which lane changed
+    xmm1 = xmm3;
+
+    // Only lane 0 competes against the final point
+    xmm3 = _mm_max_ss(xmm3, xmm2);
+
+    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+    // Broadcast lane 0 of xmm8, the index of the final point
+    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
+    // Here the roles are swapped: "old < new" means the final point won
+    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+
+    xmm9 = _mm_add_epi32(xmm11, xmm12);
+  }
+
+  _mm_store_ps((float*)&(holderf.f), xmm3);
+  _mm_store_si128(&(holderi.int_vec), xmm9);
+
+  // Scalar reduction over the four lanes: a later lane wins only if strictly greater
+  target[0] = holderi.i[0];
+  sq_dist = holderf.f[0];
+  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Finds the index of the complex value in src0 with the largest squared magnitude
+  \param target Output; target[0] receives the index of the maximum
+  \param src0 The complex input vector
+  \param num_points The number of complex values in src0
+
+  A later element replaces the current maximum only when it is strictly
+  greater. target[0] is 0 when no squared magnitude exceeds 0, which includes
+  num_points == 0.
+*/
+static inline void volk_32fc_index_max_16u_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) {
+
+  float sq_dist = 0.0;
+  float max = 0.0;
+  unsigned int index = 0;
+
+  unsigned int i = 0;
+
+  // Bound the loop by num_points itself: going through a byte count
+  // (num_points*8 >> 3) wraps in unsigned arithmetic for very large inputs
+  for(; i < num_points; ++i) {
+
+    sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+
+    if(sq_dist > max) {
+      index = i;
+      max = sq_dist;
+    }
+
+  }
+  target[0] = index;
+
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
diff --git a/volk/kernels/volk/volk_32fc_magnitude_32f.h b/volk/kernels/volk/volk_32fc_magnitude_32f.h
new file mode 100644
index 000000000..64e99cc1b
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h
@@ -0,0 +1,250 @@
+#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+  \brief Computes the magnitude of every value in complexVector and stores it in magnitudeVector
+  \param magnitudeVector Output vector receiving the real-valued magnitudes
+  \param complexVector The vector containing the complex input values
+  \param num_points The number of complex values to process
+
+  Uses unaligned loads/stores (_mm_loadu_ps / _mm_storeu_ps).
+ */
+static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  float* out = magnitudeVector;
+  const unsigned int blocks = num_points / 4;
+  unsigned int blk;
+  unsigned int tail;
+  __m128 sqLo, sqHi;
+
+  // Four complex values (eight floats) per iteration
+  for(blk = 0; blk < blocks; blk++){
+    sqLo = _mm_loadu_ps(in);
+    sqHi = _mm_loadu_ps(in + 4);
+    in += 8;
+
+    sqLo = _mm_mul_ps(sqLo, sqLo); // square every component
+    sqHi = _mm_mul_ps(sqHi, sqHi);
+
+    // The horizontal add pairs each I^2 with its Q^2; sqrt yields the magnitude
+    _mm_storeu_ps(out, _mm_sqrt_ps(_mm_hadd_ps(sqLo, sqHi)));
+    out += 4;
+  }
+
+  // Scalar path for the up to three values that do not fill a block
+  for(tail = num_points % 4; tail > 0; tail--, in += 2){
+    const float re = in[0];
+    const float im = in[1];
+    *out++ = sqrtf((re * re) + (im * im));
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+  \brief Computes the magnitude of every value in complexVector and stores it in magnitudeVector
+  \param magnitudeVector Output vector receiving the real-valued magnitudes
+  \param complexVector The vector containing the complex input values
+  \param num_points The number of complex values to process
+
+  Uses unaligned loads/stores (_mm_loadu_ps / _mm_storeu_ps).
+ */
+static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  float* out = magnitudeVector;
+  const unsigned int blocks = num_points / 4;
+  unsigned int blk;
+  unsigned int tail;
+  __m128 lo, hi, reals, imags, sumSq;
+
+  // Four complex values (eight floats) per iteration
+  for(blk = 0; blk < blocks; blk++){
+    lo = _mm_loadu_ps(in);
+    hi = _mm_loadu_ps(in + 4);
+    in += 8;
+
+    // Even lanes -> i1 i2 i3 i4, odd lanes -> q1 q2 q3 q4
+    reals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+    imags = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+
+    // I^2 + Q^2 for each of the four values
+    sumSq = _mm_add_ps(_mm_mul_ps(reals, reals), _mm_mul_ps(imags, imags));
+
+    _mm_storeu_ps(out, _mm_sqrt_ps(sumSq));
+    out += 4;
+  }
+
+  // Scalar path for the up to three values that do not fill a block
+  for(tail = num_points % 4; tail > 0; tail--, in += 2){
+    const float re = in[0];
+    const float im = in[1];
+    *out++ = sqrtf((re * re) + (im * im));
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+  \brief Computes the magnitude of every value in complexVector and stores it in magnitudeVector
+  \param magnitudeVector Output vector receiving the real-valued magnitudes
+  \param complexVector The vector containing the complex input values
+  \param num_points The number of complex values to process
+ */
+static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  // View the complex input as a flat run of floats: real, imag, real, imag, ...
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    const float re = in[0];
+    const float im = in[1];
+    magnitudeVector[idx] = sqrtf((re*re) + (im*im));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
+#ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
+#define INCLUDED_volk_32fc_magnitude_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+  \brief Computes the magnitude of every value in complexVector and stores it in magnitudeVector
+  \param magnitudeVector Output vector receiving the real-valued magnitudes
+  \param complexVector The vector containing the complex input values
+  \param num_points The number of complex values to process
+
+  Uses aligned loads/stores (_mm_load_ps / _mm_store_ps) on both vectors.
+ */
+static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  float* out = magnitudeVector;
+  const unsigned int blocks = num_points / 4;
+  unsigned int blk;
+  unsigned int tail;
+  __m128 sqLo, sqHi;
+
+  // Four complex values (eight floats) per iteration
+  for(blk = 0; blk < blocks; blk++){
+    sqLo = _mm_load_ps(in);
+    sqHi = _mm_load_ps(in + 4);
+    in += 8;
+
+    sqLo = _mm_mul_ps(sqLo, sqLo); // square every component
+    sqHi = _mm_mul_ps(sqHi, sqHi);
+
+    // The horizontal add pairs each I^2 with its Q^2; sqrt yields the magnitude
+    _mm_store_ps(out, _mm_sqrt_ps(_mm_hadd_ps(sqLo, sqHi)));
+    out += 4;
+  }
+
+  // Scalar path for the up to three values that do not fill a block
+  for(tail = num_points % 4; tail > 0; tail--, in += 2){
+    const float re = in[0];
+    const float im = in[1];
+    *out++ = sqrtf((re * re) + (im * im));
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+  \brief Computes the magnitude of every value in complexVector and stores it in magnitudeVector
+  \param magnitudeVector Output vector receiving the real-valued magnitudes
+  \param complexVector The vector containing the complex input values
+  \param num_points The number of complex values to process
+
+  Uses aligned loads/stores (_mm_load_ps / _mm_store_ps) on both vectors.
+ */
+static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* in = (float*)complexVector;
+  float* out = magnitudeVector;
+  const unsigned int blocks = num_points / 4;
+  unsigned int blk;
+  unsigned int tail;
+  __m128 lo, hi, reals, imags, sumSq;
+
+  // Four complex values (eight floats) per iteration
+  for(blk = 0; blk < blocks; blk++){
+    lo = _mm_load_ps(in);
+    hi = _mm_load_ps(in + 4);
+    in += 8;
+
+    // Even lanes -> i1 i2 i3 i4, odd lanes -> q1 q2 q3 q4
+    reals = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+    imags = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+
+    // I^2 + Q^2 for each of the four values
+    sumSq = _mm_add_ps(_mm_mul_ps(reals, reals), _mm_mul_ps(imags, imags));
+
+    _mm_store_ps(out, _mm_sqrt_ps(sumSq));
+    out += 4;
+  }
+
+  // Scalar path for the up to three values that do not fill a block
+  for(tail = num_points % 4; tail > 0; tail--, in += 2){
+    const float re = in[0];
+    const float im = in[1];
+    *out++ = sqrtf((re * re) + (im * im));
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+  \brief Computes the magnitude of every value in complexVector and stores it in magnitudeVector
+  \param magnitudeVector Output vector receiving the real-valued magnitudes
+  \param complexVector The vector containing the complex input values
+  \param num_points The number of complex values to process
+ */
+static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  // View the complex input as a flat run of floats: real, imag, real, imag, ...
+  const float* in = (float*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, in += 2){
+    const float re = in[0];
+    const float im = in[1];
+    magnitudeVector[idx] = sqrtf((re*re) + (im*im));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+ \brief Thin wrapper that forwards its arguments unchanged to volk_32fc_magnitude_32f_a_orc_impl
+ \param magnitudeVector Output vector, passed through as the first argument
+ \param complexVector Complex input vector, passed through as the second argument
+ \param num_points Number of complex values, passed through as the third argument
+
+ The _impl routine is only declared here (extern); its body is defined elsewhere.
+ NOTE(review): by its name it presumably computes per-element magnitudes like the
+ other volk_32fc_magnitude_32f kernels - confirm against the ORC source.
+ */
+extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points);
+static inline void volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */
diff --git a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
new file mode 100644
index 000000000..0af81401a
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
@@ -0,0 +1,228 @@
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  float* dst = magnitudeVector;
+  const unsigned int vecIters = num_points / 4;
+  unsigned int remaining;
+  unsigned int i;
+
+  /* Vector loop: four complex points (eight floats) per pass, using unaligned loads/stores. */
+  for(i = 0; i < vecIters; i++){
+    __m128 lo = _mm_loadu_ps(src);      /* first two points  */
+    __m128 hi = _mm_loadu_ps(src + 4);  /* second two points */
+
+    /* Square every component in place. */
+    lo = _mm_mul_ps(lo, lo);
+    hi = _mm_mul_ps(hi, hi);
+
+    /* Horizontal add sums each adjacent pair of squares, giving four results. */
+    _mm_storeu_ps(dst, _mm_hadd_ps(lo, hi));
+
+    src += 8;
+    dst += 4;
+  }
+
+  /* Scalar tail for the num_points % 4 points the vector loop did not cover. */
+  for(remaining = num_points - (vecIters * 4); remaining > 0; remaining--){
+    float re = *src++;
+    float im = *src++;
+    *dst++ = (re * re) + (im * im);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  float* dst = magnitudeVector;
+  const unsigned int vecIters = num_points / 4;
+  unsigned int remaining;
+  unsigned int i;
+
+  /* Vector loop: four complex points (eight floats) per pass, using unaligned loads/stores. */
+  for(i = 0; i < vecIters; i++){
+    const __m128 lo = _mm_loadu_ps(src);
+    const __m128 hi = _mm_loadu_ps(src + 4);
+
+    /* Split the float pairs: even lanes form one vector, odd lanes the other. */
+    const __m128 re = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+    const __m128 im = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+
+    /* re^2 + im^2 for all four points at once. */
+    _mm_storeu_ps(dst, _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)));
+
+    src += 8;
+    dst += 4;
+  }
+
+  /* Scalar tail for the num_points % 4 points the vector loop did not cover. */
+  for(remaining = num_points - (vecIters * 4); remaining > 0; remaining--){
+    float re = *src++;
+    float im = *src++;
+    *dst++ = (re * re) + (im * im);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  /* Walk the complex input as consecutive float pairs. */
+  const float* src = (float*)complexVector;
+  float* dst = magnitudeVector;
+  unsigned int remaining;
+
+  /* One output value per complex input point: real^2 + imag^2 (no square root). */
+  for(remaining = num_points; remaining > 0; remaining--){
+    const float re = *src++;
+    const float im = *src++;
+    *dst++ = (re*re) + (im*im);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_u_H */
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  float* dst = magnitudeVector;
+  const unsigned int vecIters = num_points / 4;
+  unsigned int remaining;
+  unsigned int i;
+
+  /* Vector loop: four complex points (eight floats) per pass, using aligned loads/stores. */
+  for(i = 0; i < vecIters; i++){
+    __m128 lo = _mm_load_ps(src);      /* first two points  */
+    __m128 hi = _mm_load_ps(src + 4);  /* second two points */
+
+    /* Square every component in place. */
+    lo = _mm_mul_ps(lo, lo);
+    hi = _mm_mul_ps(hi, hi);
+
+    /* Horizontal add sums each adjacent pair of squares, giving four results. */
+    _mm_store_ps(dst, _mm_hadd_ps(lo, hi));
+
+    src += 8;
+    dst += 4;
+  }
+
+  /* Scalar tail for the num_points % 4 points the vector loop did not cover. */
+  for(remaining = num_points - (vecIters * 4); remaining > 0; remaining--){
+    float re = *src++;
+    float im = *src++;
+    *dst++ = (re * re) + (im * im);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  float* dst = magnitudeVector;
+  const unsigned int vecIters = num_points / 4;
+  unsigned int remaining;
+  unsigned int i;
+
+  /* Vector loop: four complex points (eight floats) per pass, using aligned loads/stores. */
+  for(i = 0; i < vecIters; i++){
+    const __m128 lo = _mm_load_ps(src);
+    const __m128 hi = _mm_load_ps(src + 4);
+
+    /* Split the float pairs: even lanes form one vector, odd lanes the other. */
+    const __m128 re = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+    const __m128 im = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+
+    /* re^2 + im^2 for all four points at once. */
+    _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)));
+
+    src += 8;
+    dst += 4;
+  }
+
+  /* Scalar tail for the num_points % 4 points the vector loop did not cover. */
+  for(remaining = num_points - (vecIters * 4); remaining > 0; remaining--){
+    float re = *src++;
+    float im = *src++;
+    *dst++ = (re * re) + (im * im);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  /* Walk the complex input as consecutive float pairs. */
+  const float* src = (float*)complexVector;
+  float* dst = magnitudeVector;
+  unsigned int remaining;
+
+  /* One output value per complex input point: real^2 + imag^2 (no square root). */
+  for(remaining = num_points; remaining > 0; remaining--){
+    const float re = *src++;
+    const float im = *src++;
+    *dst++ = (re*re) + (im*im);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_a_H */
diff --git a/volk/kernels/volk/volk_32fc_s32f_atan2_32f.h b/volk/kernels/volk/volk_32fc_s32f_atan2_32f.h
new file mode 100644
index 000000000..b076ab44e
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_s32f_atan2_32f.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
+#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+ \brief performs the atan2 on the input vector and stores the results in the output vector.
+ \param outputVector The byte-aligned vector where the results will be stored.
+ \param complexVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
+ \param normalizeFactor The atan2 results will be divided by this normalization factor.
+ \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+ // Read the complex input as a flat float array (two floats per point).
+ const float* complexVectorPtr = (float*)complexVector;
+ float* outPtr = outputVector;
+
+ unsigned int number = 0;
+ // Precompute the reciprocal so each point is scaled with a multiply instead of a divide.
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+ // The vector loop exists only when the SIMD math library is available;
+ // otherwise number stays 0 and the scalar loop below handles every point.
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 testVector = _mm_set_ps1(2*M_PI);
+ __m128 correctVector = _mm_set_ps1(M_PI);
+ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+ __m128 phase;
+ __m128 complex1, complex2, iValue, qValue;
+ __m128 keepMask;
+
+ // Four complex points (eight floats) per iteration, using aligned loads/stores.
+ for (; number < quarterPoints; number++) {
+ // Load IQ data:
+ complex1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ complex2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ // Deinterleave IQ data:
+ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
+ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
+ // Arctan to get phase:
+ phase = atan2f4(qValue, iValue);
+ // When Q = 0 and I < 0, atan2f4 returns 2pi instead of pi.
+ // Compare to 2pi:
+ keepMask = _mm_cmpneq_ps(phase,testVector);
+ // Keep phase in lanes where it differs from 2pi; substitute pi in the others.
+ phase = _mm_blendv_ps(correctVector, phase, keepMask);
+ // done with above correction.
+ phase = _mm_mul_ps(phase, vNormalizeFactor);
+ _mm_store_ps((float*)outPtr, phase);
+ outPtr += 4;
+ }
+ // Resume the scalar loop at the first point the vector loop did not cover.
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ // Scalar path: remaining points (or all points when the vector loop is compiled out).
+ for (; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+ \brief performs the atan2 on the input vector and stores the results in the output vector.
+ \param outputVector The byte-aligned vector where the results will be stored.
+ \param complexVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
+ \param normalizeFactor The atan2 results will be divided by this normalization factor.
+ \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+ // Read the complex input as a flat float array (two floats per point).
+ const float* complexVectorPtr = (float*)complexVector;
+ float* outPtr = outputVector;
+
+ unsigned int number = 0;
+ // Precompute the reciprocal so each point is scaled with a multiply instead of a divide.
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+ // The vector loop exists only when the SIMD math library is available;
+ // otherwise number stays 0 and the scalar loop below handles every point.
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 testVector = _mm_set_ps1(2*M_PI);
+ __m128 correctVector = _mm_set_ps1(M_PI);
+ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+ __m128 phase;
+ __m128 complex1, complex2, iValue, qValue;
+ __m128 mask;
+ __m128 keepMask;
+
+ // Four complex points (eight floats) per iteration, using aligned loads/stores.
+ for (; number < quarterPoints; number++) {
+ // Load IQ data:
+ complex1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ complex2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ // Deinterleave IQ data:
+ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
+ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
+ // Arctan to get phase:
+ phase = atan2f4(qValue, iValue);
+ // When Q = 0 and I < 0, atan2f4 returns 2pi instead of pi.
+ // Compare to 2pi:
+ keepMask = _mm_cmpneq_ps(phase,testVector);
+ // Bitwise select: keep phase where it differs from 2pi ...
+ phase = _mm_and_ps(phase, keepMask);
+ // ... take pi in the lanes where it equalled 2pi ...
+ mask = _mm_andnot_ps(keepMask, correctVector);
+ // ... and merge the two halves.
+ phase = _mm_or_ps(phase, mask);
+ // done with above correction.
+ phase = _mm_mul_ps(phase, vNormalizeFactor);
+ _mm_store_ps((float*)outPtr, phase);
+ outPtr += 4;
+ }
+ // Resume the scalar loop at the first point the vector loop did not cover.
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ // Scalar path: remaining points (or all points when the vector loop is compiled out).
+ for (; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief performs the atan2 on the input vector and stores the results in the output vector.
+ \param outputVector The vector where the results will be stored.
+ \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin).
+ \param normalizeFactor The atan2 results will be divided by this normalization factor.
+ \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
+  /* Walk the complex input as consecutive float pairs. */
+  const float* src = (float*)inputVector;
+  float* dst = outputVector;
+  /* Reciprocal computed once so the loop multiplies instead of dividing. */
+  const float scale = 1.0 / normalizeFactor;
+  unsigned int remaining;
+
+  for(remaining = num_points; remaining > 0; remaining--){
+    const float re = *src++;
+    const float im = *src++;
+    *dst++ = atan2f(im, re) * scale;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */
diff --git a/volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h b/volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
new file mode 100644
index 000000000..9e10217a0
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
+#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Deinterleaves the complex vector, multiplies the real (I) values by the scalar, converts them to 16-bit integers, and stores them in the I buffer
+ \param complexVector The complex input vector
+ \param scalar The value to be multiply against each of the input values
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  int16_t* dst = iBuffer;
+  const unsigned int vecIters = num_points / 4;
+  const __m128 scale = _mm_set_ps1(scalar);
+  /* Staging area for one vector of scaled floats before the per-lane integer cast. */
+  __VOLK_ATTR_ALIGNED(16) float scaled[4];
+  unsigned int remaining;
+  unsigned int lane;
+  unsigned int i;
+
+  /* Vector loop: four complex points (eight floats) per pass, using aligned loads. */
+  for(i = 0; i < vecIters; i++){
+    const __m128 lo = _mm_load_ps(src);
+    const __m128 hi = _mm_load_ps(src + 4);
+
+    /* Pick the even lanes (first float of each point) and scale them. */
+    _mm_store_ps(scaled, _mm_mul_ps(_mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0)), scale));
+
+    for(lane = 0; lane < 4; lane++){
+      *dst++ = (int16_t)(scaled[lane]);
+    }
+
+    src += 8;
+  }
+
+  /* Scalar tail: scale the first float of each leftover point and skip the second. */
+  for(remaining = num_points - (vecIters * 4); remaining > 0; remaining--){
+    *dst++ = (int16_t)(*src * scalar);
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the complex vector, multiplies the real (I) values by the scalar, converts them to 16-bit integers, and stores them in the I buffer
+ \param complexVector The complex input vector
+ \param scalar The value to be multiply against each of the input values
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* src = (float*)complexVector;
+  int16_t* dst = iBuffer;
+  unsigned int remaining;
+
+  /* Scale the first float of every point, cast it to int16_t, and skip the second float. */
+  for(remaining = num_points; remaining > 0; remaining--){
+    *dst++ = (int16_t)(*src * scalar);
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H */
diff --git a/volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h b/volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h
new file mode 100644
index 000000000..09abd967d
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h
@@ -0,0 +1,159 @@
+#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
+#define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param scalar The scale value multiplied to the magnitude of each complex vector
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* src = (const float*)complexVector;
+  int16_t* dst = magnitudeVector;
+  const unsigned int vecIters = num_points / 4;
+  const __m128 scale = _mm_set_ps1(scalar);
+  /* Staging area for one vector of scaled magnitudes before the per-lane integer cast. */
+  __VOLK_ATTR_ALIGNED(16) float scaled[4];
+  unsigned int remaining;
+  unsigned int lane;
+  unsigned int i;
+
+  /* Vector loop: four complex points (eight floats) per pass, using aligned loads. */
+  for(i = 0; i < vecIters; i++){
+    __m128 lo = _mm_load_ps(src);
+    __m128 hi = _mm_load_ps(src + 4);
+
+    /* Square every component in place. */
+    lo = _mm_mul_ps(lo, lo);
+    hi = _mm_mul_ps(hi, hi);
+
+    /* Sum adjacent squares, take the square root, then apply the scale factor. */
+    _mm_store_ps(scaled, _mm_mul_ps(_mm_sqrt_ps(_mm_hadd_ps(lo, hi)), scale));
+
+    for(lane = 0; lane < 4; lane++){
+      *dst++ = (int16_t)(scaled[lane]);
+    }
+
+    src += 8;
+  }
+
+  /* Scalar tail for the num_points % 4 points the vector loop did not cover. */
+  for(remaining = num_points - (vecIters * 4); remaining > 0; remaining--){
+    float re = *src++;
+    float im = *src++;
+    *dst++ = (int16_t)(sqrtf((re * re) + (im * im)) * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param scalar The scale value multiplied to the magnitude of each complex vector
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* src = (const float*)complexVector;
+  int16_t* dst = magnitudeVector;
+  const unsigned int vecIters = num_points / 4;
+  const __m128 scale = _mm_set_ps1(scalar);
+  /* Staging area for one vector of scaled magnitudes before the per-lane integer cast. */
+  __VOLK_ATTR_ALIGNED(16) float scaled[4];
+  unsigned int remaining;
+  unsigned int lane;
+  unsigned int i;
+
+  /* Vector loop: four complex points (eight floats) per pass, using aligned loads. */
+  for(i = 0; i < vecIters; i++){
+    const __m128 lo = _mm_load_ps(src);
+    const __m128 hi = _mm_load_ps(src + 4);
+
+    /* Split the float pairs: even lanes form one vector, odd lanes the other. */
+    const __m128 re = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+    const __m128 im = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+
+    /* sqrt(re^2 + im^2) for all four points, then apply the scale factor. */
+    const __m128 sumSquares = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im));
+    _mm_store_ps(scaled, _mm_mul_ps(_mm_sqrt_ps(sumSquares), scale));
+
+    for(lane = 0; lane < 4; lane++){
+      *dst++ = (int16_t)(scaled[lane]);
+    }
+
+    src += 8;
+  }
+
+  /* Scalar tail for the num_points % 4 points the vector loop did not cover. */
+  for(remaining = num_points - (vecIters * 4); remaining > 0; remaining--){
+    float re = *src++;
+    float im = *src++;
+    *dst++ = (int16_t)(sqrtf((re * re) + (im * im)) * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param scalar The scale value multiplied to the magnitude of each complex vector
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  /* Walk the complex input as consecutive float pairs. */
+  const float* src = (float*)complexVector;
+  int16_t* dst = magnitudeVector;
+  unsigned int remaining;
+
+  /* Per point: magnitude, times scalar, cast to int16_t. */
+  for(remaining = num_points; remaining > 0; remaining--){
+    const float re = *src++;
+    const float im = *src++;
+    *dst++ = (int16_t)(sqrtf((re*re) + (im*im)) * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param scalar The scale value multiplied to the magnitude of each complex vector
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+// Out-of-line kernel: only declared here, its definition is not part of this header.
+extern void volk_32fc_s32f_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points);
+// Thin wrapper that forwards its four arguments, unchanged and in the same order, to the _orc_impl routine.
+// NOTE(review): the wrapper is named *_u_orc while the routine it calls is *_a_orc_impl -- confirm the naming is intentional.
+static inline void volk_32fc_s32f_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+ volk_32fc_s32f_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */
diff --git a/volk/kernels/volk/volk_32fc_s32f_power_32fc.h b/volk/kernels/volk/volk_32fc_s32f_power_32fc.h
new file mode 100644
index 000000000..d4a1d1746
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_s32f_power_32fc.h
@@ -0,0 +1,111 @@
+#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a_H
+#define INCLUDED_volk_32fc_s32f_power_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+/*!
+  \brief Raises a complex float to a real float power.
+
+  The input is converted to polar form, the magnitude is raised to \p power and the
+  phase angle is multiplied by \p power, and the result is converted back to
+  cartesian form.
+  \param exp The complex value to be raised to a power (the base, despite its name)
+  \param power The real exponent
+  \return The complex value exp raised to power
+*/
+static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, const float power){
+  // atan2f takes (y, x), so the imaginary part must come first to obtain the phase angle.
+  const float arg = power*atan2f(lv_cimag(exp), lv_creal(exp));
+  // |z|^power computed as (re^2 + im^2)^(power/2), which avoids a separate square root.
+  const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2);
+  return mag*lv_cmake(cosf(arg), sinf(arg));
+}
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+ \brief Takes each input complex vector value to the specified power and stores the results in the return vector
+ \param cVector The vector where the results will be stored
+ \param aVector The complex vector of values to be taken to a power
+ \param power The power value to be applied to each data point
+ \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+*/
+static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+ unsigned int number = 0;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+
+ // The vector loop exists only when the SIMD math library is available;
+ // otherwise number stays 0 and the scalar loop at the bottom handles every point.
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 vPower = _mm_set_ps1(power);
+
+ __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
+ // Four complex points per iteration: two aligned loads of two points each.
+ for(;number < quarterPoints; number++){
+
+ cplxValue1 = _mm_load_ps((float*)aPtr);
+ aPtr += 2;
+
+ cplxValue2 = _mm_load_ps((float*)aPtr);
+ aPtr += 2;
+
+ // Convert to polar coordinates
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ phase = atan2f4(qValue, iValue); // Calculate the Phase
+
+ magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values
+
+ // Now calculate the power of the polar coordinate data
+ magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
+
+ phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
+
+ // Convert back to cartesian coordinates
+ iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude
+ qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude
+
+ cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
+ cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
+
+ _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container
+
+ cPtr += 2;
+
+ _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container
+
+ cPtr += 2;
+ }
+
+ // Resume the scalar loop at the first point the vector loop did not cover.
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ // Scalar path: remaining points (or all points when the vector loop is compiled out),
+ // one call to the scalar power helper per point.
+ for(;number < num_points; number++){
+ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Takes each input complex vector value to the specified power and stores the results in the return vector
+ \param cVector The vector where the results will be stored
+ \param aVector The complex vector of values to be taken to a power
+ \param power The power value to be applied to each data point
+ \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+ */
+static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+  const lv_32fc_t* src = aVector;
+  lv_32fc_t* dst = cVector;
+  unsigned int remaining;
+
+  /* Apply the scalar power helper to each input point in order. */
+  for(remaining = num_points; remaining > 0; remaining--){
+    *dst++ = __volk_s32fc_s32f_power_s32fc_a((*src++), power);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a_H */
diff --git a/volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h b/volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
new file mode 100644
index 000000000..f76d9d35e
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
@@ -0,0 +1,126 @@
+#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
+#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+ \brief Calculates the log10 power value for each input point
+ \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point
+ \param complexFFTInput The complex data output from the FFT point
+ \param normalizationFactor This value is divided against all the input values before the power is calculated
+ \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
+ // Read the complex input as a flat float array (two floats per point).
+ const float* inputPtr = (const float*)complexFFTInput;
+ float* destPtr = logPowerOutput;
+ uint64_t number = 0;
+ // Precompute the reciprocal so inputs are normalized with a multiply instead of a divide.
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+ // The vector loop exists only when the SIMD math library is available;
+ // otherwise number stays 0 and the scalar loop below handles every point.
+#ifdef LV_HAVE_LIB_SIMDMATH
+ // magScalar = 10 / logf4(10): the factor that turns a logf4 result into 10*log10
+ // (assumes logf4 is the natural log, as the comments below state -- TODO confirm).
+ __m128 magScalar = _mm_set_ps1(10.0);
+ magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+ __m128 power;
+ __m128 input1, input2;
+ const uint64_t quarterPoints = num_points / 4;
+ // Four complex points (eight floats) per iteration, using aligned loads/stores.
+ for(;number < quarterPoints; number++){
+ // Load the complex values
+ input1 =_mm_load_ps(inputPtr);
+ inputPtr += 4;
+ input2 =_mm_load_ps(inputPtr);
+ inputPtr += 4;
+
+ // Apply the normalization factor
+ input1 = _mm_mul_ps(input1, invNormalizationFactor);
+ input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+ // Multiply each value by itself
+ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+ input1 = _mm_mul_ps(input1, input1);
+ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+ input2 = _mm_mul_ps(input2, input2);
+
+ // Horizontal add, to add (r*r) + (i*i) for each complex value
+ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+ power = _mm_hadd_ps(input1, input2);
+
+ // Calculate the natural log power
+ // NOTE(review): unlike the scalar loop below, no 1e-20 offset is added before the log here,
+ // so the two paths differ for zero-power inputs -- confirm whether that is intended.
+ power = logf4(power);
+
+ // Convert to log10 and multiply by 10.0
+ power = _mm_mul_ps(power, magScalar);
+
+ // Store the floating point results
+ _mm_store_ps(destPtr, power);
+
+ destPtr += 4;
+ }
+
+ // Resume the scalar loop at the first point the vector loop did not cover.
+ number = quarterPoints*4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+ // Calculate the log power for any remaining points
+
+ for(; number < num_points; number++){
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ // The 1e-20 offset keeps the argument of log10f above zero when real and imag are both 0.
+ *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
+
+ destPtr++;
+ }
+
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Calculates the log10 power value for each input point
+ \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point
+ \param complexFFTInput The complex data output from the FFT point
+ \param normalizationFactor This value is divided against all the input values before the power is calculated
+ \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
+  /* Walk the complex input as consecutive float pairs. */
+  const float* src = (float*)complexFFTInput;
+  float* dst = logPowerOutput;
+  /* Reciprocal computed once so each component is normalized with a multiply. */
+  const float invNorm = 1.0 / normalizationFactor;
+  unsigned int remaining;
+
+  for(remaining = num_points; remaining > 0; remaining--){
+    /* Normalize both components before squaring. */
+    const float re = *src++ * invNorm;
+    const float im = *src++ * invNorm;
+
+    /* 10*log10 of the power; the 1e-20 offset keeps the log argument above zero. */
+    *dst++ = 10.0*log10f(((re * re) + (im * im)) + 1e-20);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H */
diff --git a/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
new file mode 100644
index 000000000..e73eb09f8
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
@@ -0,0 +1,134 @@
+#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
+#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput The 10.0 * log10((r*r + i*i + 1e-20)/RBW) for each data point; written with aligned 16-byte stores in the vector path
+  \param complexFFTInput The complex data output from the FFT point; read with aligned 16-byte loads in the vector path
+  \param normalizationFactor This value is divided against all the input values before the power is calculated
+  \param rbw The resolution bandwidth of the fft spectrum
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  const float* inputPtr = (const float*)complexFFTInput;
+  float* destPtr = logPowerOutput;
+  uint64_t number = 0;
+  // Reciprocals are computed once so the loops multiply instead of divide
+  const float iRBW = 1.0 / rbw;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+  // 10 / ln(10): converts the natural log produced by logf4() into 10*log10()
+  __m128 magScalar = _mm_set_ps1(10.0);
+  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+  __m128 invRBW = _mm_set_ps1(iRBW);
+
+  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+  // Same floor the scalar tail below adds, so a zero-power point yields the
+  // same finite value in both paths instead of -inf in the vector path only
+  __m128 powerFloor = _mm_set_ps1(1e-20f);
+
+  __m128 power;
+  __m128 input1, input2;
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load the complex values
+    input1 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    input2 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+
+    // Apply the normalization factor
+    input1 = _mm_mul_ps(input1, invNormalizationFactor);
+    input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+    // Multiply each value by itself
+    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+    input1 = _mm_mul_ps(input1, input1);
+    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+    input2 = _mm_mul_ps(input2, input2);
+
+    // Horizontal add, to add (r*r) + (i*i) for each complex value
+    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+    power = _mm_hadd_ps(input1, input2);
+
+    // Keep the log argument strictly positive
+    power = _mm_add_ps(power, powerFloor);
+
+    // Divide by the rbw
+    power = _mm_mul_ps(power, invRBW);
+
+    // Calculate the natural log power
+    power = logf4(power);
+
+    // Convert to log10 and multiply by 10.0
+    power = _mm_mul_ps(power, magScalar);
+
+    // Store the floating point results
+    _mm_store_ps(destPtr, power);
+
+    destPtr += 4;
+  }
+
+  number = quarterPoints*4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Scalar path: handles the points left over after the vector loop, or every
+  // point when LV_HAVE_LIB_SIMDMATH is not defined
+  for(; number < num_points; number++){
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+    destPtr++;
+  }
+
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput Receives 10.0 * log10((r*r + i*i + 1e-20)/RBW) for each data point, with r and i taken after normalization
+  \param complexFFTInput The complex data output from the FFT, read as interleaved (real, imag) floats
+  \param normalizationFactor Every input component is divided by this value before the power is calculated
+  \param rbw The resolution bandwidth of the fft spectrum
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  // Walk the complex input as interleaved floats: re0, im0, re1, im1, ...
+  const float* sample = (float*)complexFFTInput;
+  float* out = logPowerOutput;
+  // Reciprocals are computed once so the loop multiplies instead of divides
+  const float invRBW = 1.0 / rbw;
+  const float scale = 1.0 / normalizationFactor;
+  unsigned int remaining = num_points;
+
+  while(remaining-- > 0){
+    const float re = sample[0] * scale;
+    const float im = sample[1] * scale;
+    sample += 2;
+
+    // The 1e-20 offset keeps the log10f() argument above zero for an all-zero point
+    *out++ = 10.0*log10f((((re * re) + (im * im)) + 1e-20) * invRBW);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H */
diff --git a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
new file mode 100644
index 000000000..668a04760
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
@@ -0,0 +1,178 @@
+#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Multiplies the input vector by a complex scalar and stores the results in the output vector
+  \param cVector The vector where the results will be stored (unaligned stores are used)
+  \param aVector The vector to be multiplied (unaligned loads are used)
+  \param scalar The complex scalar that multiplies every element of aVector
+  \param num_points The number of complex values in aVector to be multiplied and stored into cVector
+*/
+static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+  const unsigned int pairCount = num_points / 2;
+  unsigned int pair;
+  lv_32fc_t* out = cVector;
+  const lv_32fc_t* in = aVector;
+  __m128 vals, swapped, reProd, imProd;
+
+  // Broadcast the scalar's real and imaginary parts across all four lanes
+  const __m128 scalarRe = _mm_set_ps1(lv_creal(scalar));
+  const __m128 scalarIm = _mm_set_ps1(lv_cimag(scalar));
+
+  // Two complex values (a and b) per iteration; s is the scalar
+  for(pair = 0; pair < pairCount; pair++, in += 2, out += 2){
+    vals = _mm_loadu_ps((float*)in);               // ar, ai, br, bi
+    reProd = _mm_mul_ps(vals, scalarRe);           // ar*sr, ai*sr, br*sr, bi*sr
+    swapped = _mm_shuffle_ps(vals, vals, 0xB1);    // ai, ar, bi, br
+    imProd = _mm_mul_ps(swapped, scalarIm);        // ai*si, ar*si, bi*si, br*si
+
+    // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
+    _mm_storeu_ps((float*)out, _mm_addsub_ps(reProd, imProd));
+  }
+
+  // An odd point count leaves one value for the scalar path
+  if((num_points % 2) != 0) {
+    *out = (*in) * scalar;
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies the input vector by a complex scalar and stores the results in the output vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied
+  \param scalar The complex scalar that multiplies every element of aVector
+  \param num_points The number of complex values in aVector to be multiplied and stored into cVector
+*/
+static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+  lv_32fc_t* out = cVector;
+  const lv_32fc_t* in = aVector;
+  const unsigned int blocks = num_points / 8;
+  const unsigned int leftover = num_points % 8;
+  unsigned int blk, k;
+
+  // Bulk of the work in fixed-size groups of eight values
+  for(blk = 0; blk < blocks; blk++){
+    for(k = 0; k < 8; k++){
+      out[k] = in[k] * scalar;
+    }
+    out += 8;
+    in += 8;
+  }
+
+  // Whatever did not fill a whole group of eight
+  for(k = 0; k < leftover; k++){
+    out[k] = in[k] * scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+   \brief Multiplies the input vector by a complex scalar and stores the results in the output vector
+   \param cVector The vector where the results will be stored (written with aligned 16-byte stores)
+   \param aVector The vector to be multiplied (read with aligned 16-byte loads)
+   \param scalar The complex scalar that multiplies every element of aVector
+   \param num_points The number of complex values in aVector to be multiplied and stored into cVector
+ */
+static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+  unsigned int pairsLeft = num_points / 2;
+  const lv_32fc_t* src = aVector;
+  lv_32fc_t* dst = cVector;
+  __m128 data, dataSwapped, partRe, partIm;
+
+  // Scalar real part in every lane, and scalar imaginary part in every lane
+  const __m128 reLanes = _mm_set_ps1(lv_creal(scalar));
+  const __m128 imLanes = _mm_set_ps1(lv_cimag(scalar));
+
+  // Two complex values (a and b) per pass; s is the scalar
+  while(pairsLeft-- > 0){
+    data = _mm_load_ps((float*)src);                 // ar, ai, br, bi
+    partRe = _mm_mul_ps(data, reLanes);              // ar*sr, ai*sr, br*sr, bi*sr
+    dataSwapped = _mm_shuffle_ps(data, data, 0xB1);  // ai, ar, bi, br
+    partIm = _mm_mul_ps(dataSwapped, imLanes);       // ai*si, ar*si, bi*si, br*si
+
+    // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
+    _mm_store_ps((float*)dst, _mm_addsub_ps(partRe, partIm));
+
+    src += 2;
+    dst += 2;
+  }
+
+  // An odd point count leaves one value for the scalar path
+  if((num_points % 2) != 0) {
+    *dst = (*src) * scalar;
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Multiplies the input vector by a complex scalar and stores the results in the output vector
+   \param cVector The vector where the results will be stored
+   \param aVector The vector to be multiplied
+   \param scalar The complex scalar that multiplies every element of aVector
+   \param num_points The number of complex values in aVector to be multiplied and stored into cVector
+ */
+static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+  lv_32fc_t* dst = cVector;
+  const lv_32fc_t* src = aVector;
+  unsigned int groups = num_points / 8;
+  unsigned int tail = num_points % 8;
+  unsigned int k;
+
+  // Full groups of eight values first
+  while(groups-- > 0){
+    for(k = 0; k < 8; k++){
+      dst[k] = src[k] * scalar;
+    }
+    dst += 8;
+    src += 8;
+  }
+
+  // Then the values that did not fill a whole group
+  while(tail-- > 0){
+    *dst++ = *src++ * scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H */
diff --git a/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
new file mode 100644
index 000000000..ab6b7fb1d
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
@@ -0,0 +1,74 @@
+#ifndef INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
+
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <volk/volk_32fc_s32fc_x2_rotator_32fc.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*!
+  \brief Calls the generic rotator kernel with a hard-coded initial phase
+  \param outVector The vector where the results will be stored
+  \param inVector Vector to be rotated
+  \param phase_inc rotational velocity
+  \param num_points The number of values in inVector to be rotated and stored into outVector
+*/
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+  // The rotator takes its phase through a pointer; give it a local to work on
+  lv_32fc_t start_phase = lv_cmake(.3, 0.95393);
+  volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc, &start_phase, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+/*!
+  \brief Calls the SSE4.1 rotator kernel with a hard-coded initial phase
+  \param outVector The vector where the results will be stored
+  \param inVector Vector to be rotated
+  \param phase_inc rotational velocity
+  \param num_points The number of values in inVector to be rotated and stored into outVector
+*/
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+  // The rotator takes its phase through a pointer; give it a local to work on
+  lv_32fc_t start_phase = lv_cmake(.3, .95393);
+  volk_32fc_s32fc_x2_rotator_32fc_sse4_1(outVector, inVector, phase_inc, &start_phase, num_points);
+}
+
+
+
+
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+/*!
+  \brief Calls the AVX rotator kernel with a hard-coded initial phase
+  \param outVector The vector where the results will be stored
+  \param inVector Vector to be rotated
+  \param phase_inc rotational velocity
+  \param num_points The number of values in inVector to be rotated and stored into outVector
+*/
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+  // The rotator takes its phase through a pointer; give it a local to work on
+  lv_32fc_t start_phase = lv_cmake(.3, .95393);
+  volk_32fc_s32fc_x2_rotator_32fc_avx(outVector, inVector, phase_inc, &start_phase, num_points);
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
diff --git a/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
new file mode 100644
index 000000000..ffbbdff69
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
@@ -0,0 +1,257 @@
+#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
+
+
+#include <volk/volk_complex.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define ROTATOR_RELOAD 512
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*!
+  \brief rotate input vector at fixed rate per sample from initial phase offset
+  \param outVector The vector where the results will be stored
+  \param inVector Vector to be rotated
+  \param phase_inc rotational velocity; the phase is multiplied by this value after every sample
+  \param phase initial phase offset; updated in place, so on return it holds the phase for the sample after the last one processed
+  \param num_points The number of values in inVector to be rotated and stored into outVector
+*/
+static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+  unsigned int i = 0;
+  int j = 0;
+  // Full blocks of ROTATOR_RELOAD samples, renormalizing the phase after each block
+  for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
+    for(j = 0; j < ROTATOR_RELOAD; ++j) {
+      *outVector++ = *inVector++ * (*phase);
+      (*phase) *= phase_inc;
+    }
+    // Divide by the complex magnitude to pull the phase back to unit length.
+    // abs() must not be used here: it is the integer function from stdlib.h,
+    // so in C the complex value would be converted to int first.
+    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+  }
+  // Remaining samples that do not fill a whole block
+  for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
+    *outVector++ = *inVector++ * (*phase);
+    (*phase) *= phase_inc;
+  }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+/*!
+  \brief rotate input vector at fixed rate per sample from initial phase offset (two samples per vector iteration)
+  \param outVector The vector where the results will be stored; written with aligned 16-byte stores
+  \param inVector Vector to be rotated; read with aligned 16-byte loads
+  \param phase_inc rotational velocity; the phase is multiplied by this value after every sample
+  \param phase initial phase offset; updated in place, so on return it holds the phase for the sample after the last one processed
+  \param num_points The number of values in inVector to be rotated and stored into outVector
+*/
+static inline void volk_32fc_s32fc_x2_rotator_32fc_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+  lv_32fc_t* cPtr = outVector;
+  const lv_32fc_t* aPtr = inVector;
+  lv_32fc_t incr = 1;
+  lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
+
+  unsigned int i, j = 0;
+
+  // phase_Ptr[k] = phase * phase_inc^k for the two lanes; incr ends up as
+  // phase_inc^2, the step that advances both lanes by one vector iteration
+  for(i = 0; i < 2; ++i) {
+    phase_Ptr[i] *= incr;
+    incr *= (phase_inc);
+  }
+
+  __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+  phase_Val = _mm_loadu_ps((float*)phase_Ptr);
+  inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+
+  const unsigned int halfPoints = num_points / 2;
+
+
+  for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
+    for(j = 0; j < ROTATOR_RELOAD; ++j) {
+
+      aVal = _mm_load_ps((float*)aPtr);
+
+      // Real parts and imaginary parts duplicated across each complex lane pair
+      yl = _mm_moveldup_ps(phase_Val);
+      yh = _mm_movehdup_ps(phase_Val);
+      ylp = _mm_moveldup_ps(inc_Val);
+      yhp = _mm_movehdup_ps(inc_Val);
+
+      tmp1 = _mm_mul_ps(aVal, yl);
+      tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+      // Swap re/im within each complex value for the cross terms
+      aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+      phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+      tmp2 = _mm_mul_ps(aVal, yh);
+      tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+      // z = input * phase, phase_Val = phase * incr (complex products)
+      z = _mm_addsub_ps(tmp1, tmp2);
+      phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+      _mm_store_ps((float*)cPtr, z);
+
+      aPtr += 2;
+      cPtr += 2;
+    }
+    // Renormalize both phases to unit length: re^2+im^2 per complex value,
+    // spread to both of its lanes, then the square root gives the magnitude.
+    // Dividing by the squared magnitude (no sqrt) would not normalize.
+    tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+    tmp2 = _mm_hadd_ps(tmp1, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+    tmp2 = _mm_sqrt_ps(tmp1);
+    phase_Val = _mm_div_ps(phase_Val, tmp2);
+  }
+  // Vector iterations that do not fill a whole ROTATOR_RELOAD block
+  for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
+    aVal = _mm_load_ps((float*)aPtr);
+
+    yl = _mm_moveldup_ps(phase_Val);
+    yh = _mm_movehdup_ps(phase_Val);
+    ylp = _mm_moveldup_ps(inc_Val);
+    yhp = _mm_movehdup_ps(inc_Val);
+
+    tmp1 = _mm_mul_ps(aVal, yl);
+
+    tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+    aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+    phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+    tmp2 = _mm_mul_ps(aVal, yh);
+    tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+    z = _mm_addsub_ps(tmp1, tmp2);
+    phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+    _mm_store_ps((float*)cPtr, z);
+
+    aPtr += 2;
+    cPtr += 2;
+  }
+
+  // Lane 0 now holds the phase for the next unprocessed sample
+  _mm_storeu_ps((float*)phase_Ptr, phase_Val);
+  for(i = 0; i < num_points%2; ++i) {
+    *cPtr++ = *aPtr++ * phase_Ptr[0];
+    phase_Ptr[0] *= (phase_inc);
+  }
+
+  (*phase) = phase_Ptr[0];
+
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+/*!
+  \brief rotate input vector at fixed rate per sample from initial phase offset (four samples per vector iteration)
+  \param outVector The vector where the results will be stored; written with aligned 32-byte stores
+  \param inVector Vector to be rotated; read with aligned 32-byte loads
+  \param phase_inc rotational velocity; the phase is multiplied by this value after every sample
+  \param phase initial phase offset; updated in place, so on return it holds the phase for the sample after the last one processed
+  \param num_points The number of values in inVector to be rotated and stored into outVector
+*/
+static inline void volk_32fc_s32fc_x2_rotator_32fc_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+  lv_32fc_t* cPtr = outVector;
+  const lv_32fc_t* aPtr = inVector;
+  lv_32fc_t incr = 1;
+  lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+
+  unsigned int i, j = 0;
+
+  // phase_Ptr[k] = phase * phase_inc^k for the four lanes; incr ends up as
+  // phase_inc^4, the step that advances all lanes by one vector iteration
+  for(i = 0; i < 4; ++i) {
+    phase_Ptr[i] *= incr;
+    incr *= (phase_inc);
+  }
+
+  __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+  inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+  const unsigned int fourthPoints = num_points / 4;
+
+
+  for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
+    for(j = 0; j < ROTATOR_RELOAD; ++j) {
+
+      aVal = _mm256_load_ps((float*)aPtr);
+
+      // Real parts and imaginary parts duplicated across each complex lane pair
+      yl = _mm256_moveldup_ps(phase_Val);
+      yh = _mm256_movehdup_ps(phase_Val);
+      ylp = _mm256_moveldup_ps(inc_Val);
+      yhp = _mm256_movehdup_ps(inc_Val);
+
+      tmp1 = _mm256_mul_ps(aVal, yl);
+      tmp1p = _mm256_mul_ps(phase_Val, ylp);
+
+      // Swap re/im within each complex value for the cross terms
+      aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+      phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+      tmp2 = _mm256_mul_ps(aVal, yh);
+      tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+      // z = input * phase, phase_Val = phase * incr (complex products)
+      z = _mm256_addsub_ps(tmp1, tmp2);
+      phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
+
+      _mm256_store_ps((float*)cPtr, z);
+
+      aPtr += 4;
+      cPtr += 4;
+    }
+    // Renormalize all four phases to unit length: re^2+im^2 per complex value,
+    // spread to both of its lanes, then the square root gives the magnitude.
+    // Dividing by the squared magnitude (no sqrt) would not normalize.
+    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+    tmp2 = _mm256_sqrt_ps(tmp1);
+    phase_Val = _mm256_div_ps(phase_Val, tmp2);
+  }
+  // Vector iterations that do not fill a whole ROTATOR_RELOAD block
+  for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+    aVal = _mm256_load_ps((float*)aPtr);
+
+    yl = _mm256_moveldup_ps(phase_Val);
+    yh = _mm256_movehdup_ps(phase_Val);
+    ylp = _mm256_moveldup_ps(inc_Val);
+    yhp = _mm256_movehdup_ps(inc_Val);
+
+    tmp1 = _mm256_mul_ps(aVal, yl);
+
+    tmp1p = _mm256_mul_ps(phase_Val, ylp);
+
+    aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+    phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+    tmp2 = _mm256_mul_ps(aVal, yh);
+    tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+    z = _mm256_addsub_ps(tmp1, tmp2);
+    phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
+
+    _mm256_store_ps((float*)cPtr, z);
+
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  // Lane 0 now holds the phase for the next unprocessed sample
+  _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+  for(i = 0; i < num_points%4; ++i) {
+    *cPtr++ = *aPtr++ * phase_Ptr[0];
+    phase_Ptr[0] *= (phase_inc);
+  }
+
+  (*phase) = phase_Ptr[0];
+
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */
diff --git a/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
new file mode 100644
index 000000000..e6ccf5c38
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
@@ -0,0 +1,500 @@
+#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+
+
+#include<volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Computes the dot product of input with the complex conjugate of taps
+  \param result Receives the sum over k of input[k] * conj(taps[k])
+  \param input First complex vector
+  \param taps Second complex vector; each element is conjugated before multiplying
+  \param num_points Number of complex points in input and taps
+*/
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+  // Both vectors are walked as interleaved floats: re0, im0, re1, im1, ...
+  const float * in = (const float*) input;
+  const float * tp = (const float*) taps;
+  // Counts come straight from num_points; deriving them from a byte count
+  // (num_points*8) wraps around for very large vectors
+  const unsigned int n_2_ccomplex_blocks = num_points / 2;
+  const unsigned int isodd = num_points & 1;
+
+  // Two independent accumulators, one per complex value of each pair
+  float sum0[2] = {0,0};
+  float sum1[2] = {0,0};
+  unsigned int i = 0;
+
+  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+
+    // (a + jb) * conj(c + jd) = (ac + bd) + j(bc - ad)
+    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+
+    in += 4;
+    tp += 4;
+
+  }
+
+  *result = lv_cmake(sum0[0] + sum1[0], sum0[1] + sum1[1]);
+
+  // An odd point count leaves the last point outside the paired loop
+  if(isodd) {
+    *result += input[num_points - 1] * lv_conj(taps[num_points - 1]);
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#include <mmintrin.h>
+
+
+/*!
+  \brief Computes the dot product of input with the complex conjugate of taps (SSE3, unaligned loads)
+  \param result Receives the sum over k of input[k] * conj(taps[k])
+  \param input First complex vector
+  \param taps Second complex vector; each element is conjugated before multiplying
+  \param num_points Number of complex points in input and taps
+*/
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+  // Counts come straight from num_points; a byte count (num_points*8) wraps
+  // around for very large vectors
+  const unsigned int halfPoints = num_points / 2;
+  unsigned int number;
+
+  float Rsum=0, Isum=0;
+  float Im,Re;
+
+  __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
+  const __m128 zv = _mm_setzero_ps();
+  // Sign bit set in lanes 0 and 2: negates the input_re*taps_im cross terms,
+  // which is what conjugating taps does to the imaginary part of the product
+  const __m128 negMask = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f);
+
+  // main loop: two complex points per iteration
+  for(number = 0; number < halfPoints; number++){
+
+    in1 = _mm_loadu_ps( (const float*) (input + 2*number) );
+    in2 = _mm_loadu_ps( (const float*) (taps + 2*number) );
+    Rv = _mm_mul_ps(in1, in2);                               // ar*cr, ai*ci, br*dr, bi*di
+    fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));   // ci, cr, di, dr
+    Iv = _mm_mul_ps(in1, fehg);                              // ar*ci, ai*cr, br*di, bi*dr
+    Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);              // lane 0 = sum of all four lanes
+    Ivm = _mm_xor_ps( negMask, Iv );
+    Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
+    _mm_store_ss( &Im, Is );
+    _mm_store_ss( &Re, Rs );
+    Rsum += Re;
+    Isum += Im;
+  }
+
+  // Odd point count: finish the last point with scalar math. A 16-byte vector
+  // load here would read 8 bytes past the end of both buffers.
+  if(num_points & 1){
+    const float* lastIn = (const float*) (input + (num_points - 1));
+    const float* lastTap = (const float*) (taps + (num_points - 1));
+    Rsum += lastIn[0] * lastTap[0] + lastIn[1] * lastTap[1];
+    Isum += lastIn[1] * lastTap[0] - lastIn[0] * lastTap[1];
+  }
+
+  result[0] = lv_cmake(Rsum,Isum);
+  return;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
+
+
+
+#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<volk/volk_complex.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Computes the dot product of input with the complex conjugate of taps
+  \param result Receives the sum over k of input[k] * conj(taps[k])
+  \param input First complex vector
+  \param taps Second complex vector; each element is conjugated before multiplying
+  \param num_points Number of complex points in input and taps
+*/
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+  // Both vectors are walked as interleaved floats: re0, im0, re1, im1, ...
+  const float * in = (const float*) input;
+  const float * tp = (const float*) taps;
+  // Counts come straight from num_points; deriving them from a byte count
+  // (num_points*8) wraps around for very large vectors
+  const unsigned int n_2_ccomplex_blocks = num_points / 2;
+  const unsigned int isodd = num_points & 1;
+
+  // Two independent accumulators, one per complex value of each pair
+  float sum0[2] = {0,0};
+  float sum1[2] = {0,0};
+  unsigned int i = 0;
+
+  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+
+    // (a + jb) * conj(c + jd) = (ac + bd) + j(bc - ad)
+    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+
+    in += 4;
+    tp += 4;
+
+  }
+
+  *result = lv_cmake(sum0[0] + sum1[0], sum0[1] + sum1[1]);
+
+  // An odd point count leaves the last point outside the paired loop
+  if(isodd) {
+    *result += input[num_points - 1] * lv_conj(taps[num_points - 1]);
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+/*!
+  \brief Conjugate dot product implemented with x86-64 inline assembly (SSE)
+  \param result Receives the sum over k of input[k] * conj(taps[k])
+  \param input First complex vector; read with movaps, so it must be 16-byte aligned
+  \param taps Second complex vector; XORed with the conjugator mask before multiplying, and read with movaps inside the loop
+  \param num_points Number of complex points in input and taps
+
+  The assembly handles whole pairs of complex points (16 bytes) and stores the
+  low 64 bits of its accumulator to result; the C loop after it adds the last
+  point when num_points is odd.
+
+  NOTE(review): the asm writes xmm0-xmm8 and stores through result, but the
+  clobber list names only rax, r8, r9 and r10 -- confirm whether the xmm
+  registers and "memory" need to be declared.
+  NOTE(review): the first pair is loaded before any count is checked, and each
+  loop pass preloads the block at offset 32 before the trip count is re-tested;
+  this looks like a read past the end of input and taps on the final pass (and
+  for num_points == 0) -- confirm the buffers are padded.
+  NOTE(review): num_bytes is a 32-bit value passed in "c" but the asm copies
+  the full rcx -- presumably the upper half is zero; verify.
+*/
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+
+ // Sign-bit mask for the odd (imaginary) float lanes; XORing taps with it conjugates them
+ __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+
+
+
+
+ // rax = num_bytes >> 5 counts loop passes (32 bytes each); r8 = num_bytes >> 4
+ // is tested for an odd trailing 16-byte block. Lines inside the strings that
+ // start with '#' are assembler comments.
+ asm volatile
+ (
+ "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %[conjugator], %%r9\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movaps 0(%%r9), %%xmm8\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movups 0(%%r10), %%xmm2\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " xorps %%xmm8, %%xmm2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%r9), %%xmmA\n\t"
+ "# movaps (%%r10), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%r9), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%r10), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " xorps %%xmm8, %%xmm3\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%r9), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " movaps 32(%%r10), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ " xorps %%xmm8, %%xmm2\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+ :
+ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
+ :"rax", "r8", "r9", "r10"
+ );
+
+
+ // num_bytes % 16 is 8 when num_points is odd and 0 otherwise, so the loop
+ // below runs at most once and adds the final point the asm did not cover
+ int getem = num_bytes % 16;
+
+
+ for(; getem > 0; getem -= 8) {
+
+
+ *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
+
+ }
+
+ return;
+}
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+/*!
+  \brief Conjugate dot product implemented with 32-bit x86 inline assembly (SSE)
+  \param result Receives the sum over k of input[k] * conj(taps[k]); the asm also uses this location as scratch storage while it runs
+  \param input First complex vector; read with movaps, so it must be 16-byte aligned
+  \param taps Second complex vector; XORed with the conjugator mask before multiplying, and read with movaps
+  \param num_points Number of complex points in input and taps
+
+  The assembly handles whole pairs of complex points (16 bytes); the C loop
+  after it adds the last point when num_points is odd.
+
+  NOTE(review): the asm modifies the registers bound to input, taps and
+  num_bytes (addl/shrl/decl) although they are declared as input-only
+  operands, and it declares no clobbers for the xmm registers or memory --
+  confirm the constraints before relying on this kernel under optimization.
+*/
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+
+ // Sign-bit mask for the odd (imaginary) float lanes; XORing taps with it conjugates them
+ __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+
+ // bound = number of complex pairs handled by the asm; leftovers is 8 when
+ // num_points is odd and 0 otherwise
+ int bound = num_bytes >> 4;
+ int leftovers = num_bytes % 16;
+
+
+ asm volatile
+ (
+ " #pushl %%ebp\n\t"
+ " #movl %%esp, %%ebp\n\t"
+ " #movl 12(%%ebp), %%eax # input\n\t"
+ " #movl 16(%%ebp), %%edx # taps\n\t"
+ " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
+ " movaps 0(%[conjugator]), %%xmm1\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%[eax]), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%[edx]), %%xmm2\n\t"
+ " movl %[ecx], (%[out])\n\t"
+ " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
+
+ " xorps %%xmm1, %%xmm2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%[eax]), %%xmmA\n\t"
+ "# movaps (%[edx]), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%[edx]), %%xmm3\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " xorps %%xmm1, %%xmm3\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " movaps 16(%[eax]), %%xmm1\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " movaps 0(%[conjugator]), %%xmm1\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%[eax]), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " addl $32, %[eax]\n\t"
+ " movaps 32(%[edx]), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " xorps %%xmm1, %%xmm2\n\t"
+ " addl $32, %[edx]\n\t"
+ ".%=L1_test:\n\t"
+ " decl %[ecx]\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
+ " shrl $4, %[ecx]\n\t"
+ " andl $1, %[ecx]\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " #movl 8(%%ebp), %[eax] \n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " movl $0x80000000, (%[out])\n\t"
+ " movss (%[out]), %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " #movl 8(%%ebp), %[eax] # @result\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t"
+ " #popl %%ebp\n\t"
+ :
+ : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
+ );
+
+ // No diagnostic output here: a compute kernel must not write to stdout on
+ // every call.
+
+ // Runs at most once: adds the final point when num_points is odd
+ // (index bound*2 == num_points - 1 in that case)
+ for(; leftovers > 0; leftovers -= 8) {
+
+ *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
+
+ }
+
+ return;
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_32*/
+
+
+
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/
diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
new file mode 100644
index 000000000..066bed443
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -0,0 +1,562 @@
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+  /*
+   * Complex dot product: *result = sum over k of input[k] * taps[k]
+   * (no conjugation), for k in [0, num_points).
+   *
+   * The vectors are walked as interleaved re/im floats, two complex
+   * points per iteration, with one accumulator pair per point so the
+   * summation order stays fixed.  An odd trailing point is added last.
+   */
+  float* out = (float*) result;
+  float* x = (float*) input;
+  float* h = (float*) taps;
+  unsigned int pairs_left;
+
+  /* Accumulators for the even-indexed and odd-indexed point of each pair. */
+  float re_even = 0, im_even = 0;
+  float re_odd = 0, im_odd = 0;
+
+  for(pairs_left = num_points / 2; pairs_left != 0; --pairs_left) {
+    re_even += x[0] * h[0] - x[1] * h[1];
+    im_even += x[0] * h[1] + x[1] * h[0];
+    re_odd += x[2] * h[2] - x[3] * h[3];
+    im_odd += x[2] * h[3] + x[3] * h[2];
+
+    x += 4;
+    h += 4;
+  }
+
+  out[0] = re_even + re_odd;
+  out[1] = im_even + im_odd;
+
+  /* Odd length: fold in the final point that the paired loop skipped. */
+  if((num_points & 1) != 0) {
+    *result += input[num_points - 1] * taps[num_points - 1];
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+  /*
+   * Complex dot product (no conjugation) of input and taps using SSE3:
+   *   *result = sum over k of input[k] * taps[k], k in [0, num_points).
+   *
+   * Uses unaligned loads, so input and taps need no particular alignment.
+   * Two complex points are multiplied per iteration; an odd trailing
+   * point is handled in scalar code after the vector loop.
+   */
+  lv_32fc_t dotProduct;
+  memset(&dotProduct, 0x0, 2*sizeof(float));
+
+  unsigned int number = 0;
+  const unsigned int halfPoints = num_points/2;
+
+  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+  const lv_32fc_t* a = input;
+  const lv_32fc_t* b = taps;
+
+  dotProdVal = _mm_setzero_ps();
+
+  for(;number < halfPoints; number++){
+
+    x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+    a += 2;
+    b += 2;
+  }
+
+  __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+
+  _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  // Collapse the two per-lane partial sums into one complex value
+  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+
+  // Odd length: a and b now point at the last, unpaired element.
+  // (The previous test was "num_points % 1", which is always 0 and
+  // silently dropped this element.)
+  if((num_points % 2) != 0) {
+    dotProduct += (*a) * (*b);
+  }
+
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+  /*
+   * Complex dot product: *result = sum over k of input[k] * taps[k]
+   * (no conjugation), for k in [0, num_points).
+   *
+   * Block counts are derived directly from num_points.  The earlier
+   * form went through num_points*8 in an unsigned int, which wraps for
+   * num_points >= 2^29 and would silently shorten the sum.
+   */
+  float * res = (float*) result;
+  float * in = (float*) input;
+  float * tp = (float*) taps;
+  const unsigned int n_2_ccomplex_blocks = num_points / 2;
+  const unsigned int isodd = num_points & 1;
+
+  /* One accumulator pair per point of each two-point block. */
+  float sum0[2] = {0,0};
+  float sum1[2] = {0,0};
+  unsigned int i = 0;
+
+  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+    sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+    in += 4;
+    tp += 4;
+  }
+
+  res[0] = sum0[0] + sum1[0];
+  res[1] = sum0[1] + sum1[1];
+
+  /* Odd length: add the final point that the paired loop skipped. */
+  if(isodd) {
+    *result += input[num_points - 1] * taps[num_points - 1];
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+  /*
+   * Complex dot product (no conjugation) in hand-written x86-64 SSE:
+   *   *result = sum over k of input[k] * taps[k], k in [0, num_points).
+   *
+   * The assembly consumes the data in 16-byte blocks (two complex points)
+   * with movaps, so input and taps must be 16-byte aligned.  It stores the
+   * sum of all complete two-point blocks to *result; the scalar code after
+   * the asm adds the last point when num_points is odd.
+   *
+   * NOTE(review): the asm preloads 16 bytes from input and taps before it
+   * tests the block count, and preloads the next block at the bottom of
+   * each loop iteration, so it reads up to 16 bytes past the data it
+   * actually multiplies.  Callers presumably provide readable padding --
+   * confirm.
+   *
+   * The byte count is held in a 64-bit variable because the asm reads the
+   * full %rcx register; binding a 32-bit value to "c" leaves the upper half
+   * unspecified.  The clobber list names every register the asm overwrites
+   * plus "cc" and "memory" (the asm writes *result behind the compiler's
+   * back).
+   */
+  const unsigned long long num_bytes = (unsigned long long)num_points * 8;
+
+  asm volatile
+    (
+     "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+     "# const float *taps, unsigned num_bytes)\n\t"
+     "# float sum0 = 0;\n\t"
+     "# float sum1 = 0;\n\t"
+     "# float sum2 = 0;\n\t"
+     "# float sum3 = 0;\n\t"
+     "# do {\n\t"
+     "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+     "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+     "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+     "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+     "# input += 4;\n\t"
+     "# taps += 4; \n\t"
+     "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+     "# result[0] = sum0 + sum2;\n\t"
+     "# result[1] = sum1 + sum3;\n\t"
+     "# TODO: prefetch and better scheduling\n\t"
+     " xor %%r9, %%r9\n\t"
+     " xor %%r10, %%r10\n\t"
+     " movq %%rcx, %%rax\n\t"
+     " movq %%rcx, %%r8\n\t"
+     " movq %[rsi], %%r9\n\t"
+     " movq %[rdx], %%r10\n\t"
+     " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+     " movaps 0(%%r9), %%xmm0\n\t"
+     " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+     " movaps 0(%%r10), %%xmm2\n\t"
+     " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+     " shr $4, %%r8\n\t"
+     " jmp .%=L1_test\n\t"
+     " # 4 taps / loop\n\t"
+     " # something like ?? cycles / loop\n\t"
+     ".%=Loop1: \n\t"
+     "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "# movaps (%%r9), %%xmmA\n\t"
+     "# movaps (%%r10), %%xmmB\n\t"
+     "# movaps %%xmmA, %%xmmZ\n\t"
+     "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+     "# mulps %%xmmB, %%xmmA\n\t"
+     "# mulps %%xmmZ, %%xmmB\n\t"
+     "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "# xorps %%xmmPN, %%xmmA\n\t"
+     "# movaps %%xmmA, %%xmmZ\n\t"
+     "# unpcklps %%xmmB, %%xmmA\n\t"
+     "# unpckhps %%xmmB, %%xmmZ\n\t"
+     "# movaps %%xmmZ, %%xmmY\n\t"
+     "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+     "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+     "# addps %%xmmZ, %%xmmA\n\t"
+     "# addps %%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     " movaps 16(%%r9), %%xmm1\n\t"
+     " movaps %%xmm0, %%xmm4\n\t"
+     " mulps %%xmm2, %%xmm0\n\t"
+     " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+     " movaps 16(%%r10), %%xmm3\n\t"
+     " movaps %%xmm1, %%xmm5\n\t"
+     " addps %%xmm0, %%xmm6\n\t"
+     " mulps %%xmm3, %%xmm1\n\t"
+     " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+     " addps %%xmm1, %%xmm6\n\t"
+     " mulps %%xmm4, %%xmm2\n\t"
+     " movaps 32(%%r9), %%xmm0\n\t"
+     " addps %%xmm2, %%xmm7\n\t"
+     " mulps %%xmm5, %%xmm3\n\t"
+     " add $32, %%r9\n\t"
+     " movaps 32(%%r10), %%xmm2\n\t"
+     " addps %%xmm3, %%xmm7\n\t"
+     " add $32, %%r10\n\t"
+     ".%=L1_test:\n\t"
+     " dec %%rax\n\t"
+     " jge .%=Loop1\n\t"
+     " # We've handled the bulk of multiplies up to here.\n\t"
+     " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     " # If so, we've got 2 more taps to do.\n\t"
+     " and $1, %%r8\n\t"
+     " je .%=Leven\n\t"
+     " # The count was odd, do 2 more taps.\n\t"
+     " # Note that we've already got mm0/mm2 preloaded\n\t"
+     " # from the main loop.\n\t"
+     " movaps %%xmm0, %%xmm4\n\t"
+     " mulps %%xmm2, %%xmm0\n\t"
+     " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+     " addps %%xmm0, %%xmm6\n\t"
+     " mulps %%xmm4, %%xmm2\n\t"
+     " addps %%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     " # neg inversor\n\t"
+     " xorps %%xmm1, %%xmm1\n\t"
+     " mov $0x80000000, %%r9\n\t"
+     " movd %%r9, %%xmm1\n\t"
+     " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+     " # pfpnacc\n\t"
+     " xorps %%xmm1, %%xmm6\n\t"
+     " movaps %%xmm6, %%xmm2\n\t"
+     " unpcklps %%xmm7, %%xmm6\n\t"
+     " unpckhps %%xmm7, %%xmm2\n\t"
+     " movaps %%xmm2, %%xmm3\n\t"
+     " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+     " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+     " addps %%xmm2, %%xmm6\n\t"
+     " # xmm6 = r1 i2 r3 i4\n\t"
+     " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+     " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+     :
+     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+     :"rax", "r8", "r9", "r10",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+      "cc", "memory"
+     );
+
+  /* Odd length: add the final point, which the 16-byte blocks did not cover. */
+  if((num_points & 1) != 0) {
+    *result += (input[num_points - 1] * taps[num_points - 1]);
+  }
+
+  return;
+
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+  /*
+   * Complex dot product (no conjugation) for 32-bit x86 builds:
+   *   *result = sum over k of input[k] * taps[k], k in [0, num_points).
+   *
+   * There is no dedicated 32-bit SSE implementation here; the work is
+   * delegated to the portable kernel so that this entry point still
+   * produces a correct result.
+   */
+  volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_32*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+  /*
+   * Complex dot product (no conjugation) of input and taps using SSE3:
+   *   *result = sum over k of input[k] * taps[k], k in [0, num_points).
+   *
+   * Uses aligned loads (_mm_load_ps), so input and taps must be 16-byte
+   * aligned.  Two complex points are multiplied per iteration; an odd
+   * trailing point is handled in scalar code after the vector loop.
+   *
+   * Counts come straight from num_points: the earlier num_points*8 byte
+   * count wrapped in an unsigned int for num_points >= 2^29.
+   */
+  lv_32fc_t dotProduct;
+  memset(&dotProduct, 0x0, 2*sizeof(float));
+
+  unsigned int number = 0;
+  const unsigned int halfPoints = num_points / 2;
+
+  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+  const lv_32fc_t* a = input;
+  const lv_32fc_t* b = taps;
+
+  dotProdVal = _mm_setzero_ps();
+
+  for(;number < halfPoints; number++){
+
+    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+    a += 2;
+    b += 2;
+  }
+
+  __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+
+  _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  // Collapse the two per-lane partial sums into one complex value
+  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+
+  // Odd length: a and b now point at the last, unpaired element
+  if((num_points & 1) != 0) {
+    dotProduct += (*a) * (*b);
+  }
+
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+  /*
+   * Complex dot product (no conjugation) of input and taps using the
+   * SSE4.1 dot-product instruction:
+   *   *result = sum over k of input[k] * taps[k], k in [0, num_points).
+   *
+   * Uses aligned loads, so input and taps must be 16-byte aligned.  Four
+   * complex points are processed per iteration; the remaining 0-3 points
+   * are added in scalar code.
+   *
+   * Fixes relative to the earlier version:
+   *  - the accumulators are zeroed with _mm_setzero_ps(); they used to be
+   *    "cleared" as x = x - x on uninitialised registers, which reads
+   *    indeterminate values and gives NaN whenever the garbage is NaN/Inf;
+   *  - the sign mask is built with _mm_set_ss(-0.0f) instead of a
+   *    compiler-specific brace initialiser of __m128i;
+   *  - counts come from num_points directly (num_points*8 wrapped in an
+   *    unsigned int for num_points >= 2^29).
+   */
+  const unsigned int bound = num_points / 4;
+  const unsigned int leftovers = num_points & 3;
+  unsigned int i = 0;
+
+  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+  const float *p_input, *p_taps;
+  __m64 *p_result;
+
+  p_result = (__m64*)result;
+  p_input = (const float*)input;
+  p_taps = (const float*)taps;
+
+  // Sign bit set in lane 0 only: flips the sign of the imag*imag sum
+  const __m128 neg = _mm_set_ss(-0.0f);
+
+  real0 = _mm_setzero_ps();
+  real1 = _mm_setzero_ps();
+  im0 = _mm_setzero_ps();
+  im1 = _mm_setzero_ps();
+
+  for(; i < bound; ++i) {
+
+    xmm0 = _mm_load_ps(p_input);
+    xmm1 = _mm_load_ps(p_taps);
+
+    p_input += 4;
+    p_taps += 4;
+
+    xmm2 = _mm_load_ps(p_input);
+    xmm3 = _mm_load_ps(p_taps);
+
+    p_input += 4;
+    p_taps += 4;
+
+    // Two rounds of unpack de-interleave re/im into separate vectors
+    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+
+    //imaginary vector from input
+    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+    //real vector from input
+    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+    //imaginary vector from taps
+    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+    //real vector from taps
+    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+
+    // Mask 0xf1 puts the 4-term sum in lane 0, 0xf2 puts it in lane 1
+    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); // sum(re*re) -> lane 0
+    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); // sum(im*im) -> lane 0
+
+    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); // sum(re*im) -> lane 1
+    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); // sum(im*re) -> lane 1
+
+    real0 = _mm_add_ps(xmm4, real0);
+    real1 = _mm_add_ps(xmm5, real1);
+    im0 = _mm_add_ps(xmm6, im0);
+    im1 = _mm_add_ps(xmm7, im1);
+
+  }
+
+  // real part = sum(re*re) - sum(im*im): negate lane 0 of real1, then add
+  real1 = _mm_xor_ps(real1, neg);
+
+  im0 = _mm_add_ps(im0, im1);
+  real0 = _mm_add_ps(real0, real1);
+
+  // lane 0 now holds the real part, lane 1 the imaginary part
+  im0 = _mm_add_ps(im0, real0);
+
+  _mm_storel_pi(p_result, im0);
+
+  // Scalar tail for the 0-3 points not covered by the vector loop
+  for(i = bound * 4; i < (bound * 4) + leftovers; ++i) {
+
+    *result += input[i] * taps[i];
+  }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/
diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
new file mode 100644
index 000000000..7db68c1bd
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -0,0 +1,170 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  /* Two complex products per iteration; loads and stores are unaligned. */
+  const unsigned int pairCount = num_points / 2;
+  unsigned int pair;
+
+  for(pair = 0; pair < pairCount; pair++){
+    const float* aPair = (const float*)(aVector + 2 * pair);
+    const float* bPair = (const float*)(bVector + 2 * pair);
+    float* cPair = (float*)(cVector + 2 * pair);
+
+    const __m128 aVal = _mm_loadu_ps(aPair);                  // ar,ai,br,bi
+    const __m128 bVal = _mm_loadu_ps(bPair);                  // cr,ci,dr,di
+    const __m128 bReal = _mm_moveldup_ps(bVal);               // cr,cr,dr,dr
+    const __m128 bImag = _mm_movehdup_ps(bVal);               // ci,ci,di,di
+    const __m128 straight = _mm_mul_ps(aVal, bReal);          // ar*cr,ai*cr,br*dr,bi*dr
+    const __m128 aSwapped = _mm_shuffle_ps(aVal, aVal, 0xB1); // ai,ar,bi,br
+    const __m128 crossed = _mm_mul_ps(aSwapped, bImag);       // ai*ci,ar*ci,bi*di,br*di
+
+    // addsub: subtract in the real lanes, add in the imaginary lanes
+    _mm_storeu_ps(cPair, _mm_addsub_ps(straight, crossed));
+  }
+
+  // Odd length: the final element has no partner, multiply it in scalar code
+  if((num_points & 1) != 0) {
+    cVector[num_points - 1] = aVector[num_points - 1] * bVector[num_points - 1];
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  /* Portable element-wise complex product: cVector[k] = aVector[k] * bVector[k]. */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  /* Two complex products per iteration; aligned loads/stores need 16-byte aligned vectors. */
+  const unsigned int pairCount = num_points / 2;
+  unsigned int pair;
+
+  for(pair = 0; pair < pairCount; pair++){
+    const float* aPair = (const float*)(aVector + 2 * pair);
+    const float* bPair = (const float*)(bVector + 2 * pair);
+    float* cPair = (float*)(cVector + 2 * pair);
+
+    const __m128 aVal = _mm_load_ps(aPair);                   // ar,ai,br,bi
+    const __m128 bVal = _mm_load_ps(bPair);                   // cr,ci,dr,di
+    const __m128 bReal = _mm_moveldup_ps(bVal);               // cr,cr,dr,dr
+    const __m128 bImag = _mm_movehdup_ps(bVal);               // ci,ci,di,di
+    const __m128 straight = _mm_mul_ps(aVal, bReal);          // ar*cr,ai*cr,br*dr,bi*dr
+    const __m128 aSwapped = _mm_shuffle_ps(aVal, aVal, 0xB1); // ai,ar,bi,br
+    const __m128 crossed = _mm_mul_ps(aSwapped, bImag);       // ai*ci,ar*ci,bi*di,br*di
+
+    // addsub: subtract in the real lanes, add in the imaginary lanes
+    _mm_store_ps(cPair, _mm_addsub_ps(straight, crossed));
+  }
+
+  // Odd length: the final element has no partner, multiply it in scalar code
+  if((num_points & 1) != 0) {
+    cVector[num_points - 1] = aVector[num_points - 1] * bVector[num_points - 1];
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  /* Portable element-wise complex product: cVector[k] = aVector[k] * bVector[k]. */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] * bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
+                                                  const lv_32fc_t* aVector,
+                                                  const lv_32fc_t* bVector,
+                                                  unsigned int num_points);
+
+static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points)
+{
+  /* Pure forwarding wrapper: all work happens in the externally defined ORC implementation. */
+  volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
new file mode 100644
index 000000000..cfd6c007f
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
@@ -0,0 +1,162 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector First vector to be multiplied
+ \param bVector Second vector that is conjugated before being multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  /* Two products a * conj(b) per iteration; loads and stores are unaligned. */
+  const unsigned int pairCount = num_points / 2;
+  unsigned int pair;
+
+  // Sign bit set in the imaginary lanes only: XOR with it conjugates two complex values
+  const __m128 signFlipImag = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+  for(pair = 0; pair < pairCount; pair++){
+    const float* aPair = (const float*)(aVector + 2 * pair);
+    const float* bPair = (const float*)(bVector + 2 * pair);
+    float* cPair = (float*)(cVector + 2 * pair);
+
+    const __m128 aVal = _mm_loadu_ps(aPair);                           // ar,ai,br,bi
+    const __m128 bConj = _mm_xor_ps(_mm_loadu_ps(bPair), signFlipImag); // cr,-ci,dr,-di
+    const __m128 bReal = _mm_moveldup_ps(bConj);                       // real parts duplicated
+    const __m128 bImag = _mm_movehdup_ps(bConj);                       // (negated) imag parts duplicated
+    const __m128 straight = _mm_mul_ps(aVal, bReal);
+    const __m128 aSwapped = _mm_shuffle_ps(aVal, aVal, 0xB1);          // ai,ar,bi,br
+    const __m128 crossed = _mm_mul_ps(aSwapped, bImag);
+
+    // addsub: subtract in the real lanes, add in the imaginary lanes
+    _mm_storeu_ps(cPair, _mm_addsub_ps(straight, crossed));
+  }
+
+  // Odd length: the final element has no partner, handle it in scalar code
+  if((num_points & 1) != 0) {
+    cVector[num_points - 1] = aVector[num_points - 1] * lv_conj(bVector[num_points - 1]);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector First vector to be multiplied
+ \param bVector Second vector that is conjugated before being multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  /* Portable element-wise product: cVector[k] = aVector[k] * conj(bVector[k]). */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] * lv_conj(bVector[idx]);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector First vector to be multiplied
+ \param bVector Second vector that is conjugated before being multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  /* Two products a * conj(b) per iteration; aligned loads/stores need 16-byte aligned vectors. */
+  const unsigned int pairCount = num_points / 2;
+  unsigned int pair;
+
+  // Sign bit set in the imaginary lanes only: XOR with it conjugates two complex values
+  const __m128 signFlipImag = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+  for(pair = 0; pair < pairCount; pair++){
+    const float* aPair = (const float*)(aVector + 2 * pair);
+    const float* bPair = (const float*)(bVector + 2 * pair);
+    float* cPair = (float*)(cVector + 2 * pair);
+
+    const __m128 aVal = _mm_load_ps(aPair);                            // ar,ai,br,bi
+    const __m128 bConj = _mm_xor_ps(_mm_load_ps(bPair), signFlipImag); // cr,-ci,dr,-di
+    const __m128 bReal = _mm_moveldup_ps(bConj);                       // real parts duplicated
+    const __m128 bImag = _mm_movehdup_ps(bConj);                       // (negated) imag parts duplicated
+    const __m128 straight = _mm_mul_ps(aVal, bReal);
+    const __m128 aSwapped = _mm_shuffle_ps(aVal, aVal, 0xB1);          // ai,ar,bi,br
+    const __m128 crossed = _mm_mul_ps(aSwapped, bImag);
+
+    // addsub: subtract in the real lanes, add in the imaginary lanes
+    _mm_store_ps(cPair, _mm_addsub_ps(straight, crossed));
+  }
+
+  // Odd length: the final element has no partner, handle it in scalar code
+  if((num_points & 1) != 0) {
+    cVector[num_points - 1] = aVector[num_points - 1] * lv_conj(bVector[num_points - 1]);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector First vector to be multiplied
+ \param bVector Second vector that is conjugated before being multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  /* Portable element-wise product: cVector[k] = aVector[k] * conj(bVector[k]). */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] * lv_conj(bVector[idx]);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
diff --git a/volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
new file mode 100644
index 000000000..cb2e94501
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
@@ -0,0 +1,130 @@
+#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
+#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) {
+  /*
+   * For every k in [0, num_points):
+   *   target[k] = scalar * |src0[0] - points[k]|^2
+   *
+   * Uses aligned loads/stores, so points and target must be 16-byte
+   * aligned.  Four points are handled per vector iteration, then an
+   * optional pair, then an optional single point.
+   *
+   * The earlier software-pipelined version always loaded four points and
+   * stored four results outside its loop, so num_points < 4 read past
+   * points and wrote past target.  Every load and store now sits inside
+   * a guarded block.  Counts also come from num_points directly, because
+   * num_points*8 wrapped in an unsigned int for num_points >= 2^29.
+   */
+  const unsigned int quads = num_points >> 2;
+  const unsigned int has_pair = (num_points >> 1) & 1;
+  const unsigned int has_single = num_points & 1;
+  unsigned int i;
+
+  __m128 ref, scale, p01, p23, d01, d23;
+  lv_32fc_t diff;
+
+  // ref = (src.re, src.im, src.re, src.im): the reference point twice
+  ref = _mm_setzero_ps();
+  ref = _mm_loadl_pi(ref, (__m64*)src0);
+  ref = _mm_movelh_ps(ref, ref);
+  scale = _mm_load1_ps(&scalar);
+
+  for(i = 0; i < quads; ++i) {
+    p01 = _mm_load_ps((float*)&points[0]);
+    p23 = _mm_load_ps((float*)&points[2]);
+    points += 4;
+
+    d01 = _mm_sub_ps(ref, p01);
+    d23 = _mm_sub_ps(ref, p23);
+    d01 = _mm_mul_ps(d01, d01);
+    d23 = _mm_mul_ps(d23, d23);
+
+    // hadd sums re^2 + im^2 per point, giving four squared distances
+    d01 = _mm_hadd_ps(d01, d23);
+    d01 = _mm_mul_ps(d01, scale);
+
+    _mm_store_ps(target, d01);
+    target += 4;
+  }
+
+  if(has_pair) {
+    p01 = _mm_load_ps((float*)&points[0]);
+    points += 2;
+
+    d01 = _mm_sub_ps(ref, p01);
+    d01 = _mm_mul_ps(d01, d01);
+    // both halves of the hadd result hold the same two distances
+    d01 = _mm_hadd_ps(d01, d01);
+    d01 = _mm_mul_ps(d01, scale);
+
+    _mm_storeh_pi((__m64*)target, d01);
+    target += 2;
+  }
+
+  if(has_single) {
+    diff = src0[0] - points[0];
+    target[0] = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+  }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) {
+  /*
+   * For every k in [0, num_points):
+   *   target[k] = scalar * |src0[0] - points[k]|^2
+   *
+   * The loop is bounded by num_points itself; the earlier bound went
+   * through num_points*8 in an unsigned int, which wraps for
+   * num_points >= 2^29 and would leave part of target unwritten.
+   */
+  lv_32fc_t diff;
+  float sq_dist;
+  unsigned int i = 0;
+
+  for(; i < num_points; ++i) {
+    diff = src0[0] - points[i];
+
+    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+
+    target[i] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/
diff --git a/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h
new file mode 100644
index 000000000..27a081b7c
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h
@@ -0,0 +1,116 @@
+#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
+#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
+  /*
+   * SSE3 implementation. For each i in [0, num_points):
+   *   target[i] = |src0[0] - points[i]|^2
+   * Only the first element of src0 is read. Aligned loads/stores are used on
+   * points and target, so both must be 16-byte aligned.
+   *
+   * Work is split into groups of four points, then an optional pair, then an
+   * optional single point. Every load and store is guarded by its count, so
+   * num_points < 4 (including 0) no longer reads four points and writes four
+   * floats past the end of the buffers as the unconditional prologue and
+   * epilogue of the earlier version did. The counts are derived from
+   * num_points directly to avoid the unsigned wrap of num_points*8.
+   */
+  const unsigned int quads = num_points >> 2;
+  const unsigned int has_pair = (num_points >> 1) & 1;
+  const unsigned int has_single = num_points & 1;
+
+  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+  lv_32fc_t diff;
+  float sq_dist;
+  unsigned int i;
+
+  /* xmm1 = (re, im, re, im) of src0[0] */
+  xmm1 = _mm_setzero_ps();
+  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+  xmm1 = _mm_movelh_ps(xmm1, xmm1);
+
+  for(i = 0; i < quads; ++i) {
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    xmm3 = _mm_load_ps((float*)&points[2]);
+    points += 4;
+
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    xmm5 = _mm_sub_ps(xmm1, xmm3);
+
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+    xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+    /* horizontal add pairs re^2 + im^2 for each of the four points */
+    xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+    _mm_store_ps(target, xmm4);
+
+    target += 4;
+  }
+
+  if(has_pair) {
+    xmm2 = _mm_load_ps((float*)&points[0]);
+
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+
+    points += 2;
+
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+    /* both halves of the result hold the same two distances */
+    xmm4 = _mm_hadd_ps(xmm6, xmm6);
+
+    _mm_storeh_pi((__m64*)target, xmm4);
+
+    target += 2;
+  }
+
+  if(has_single) {
+    diff = src0[0] - points[0];
+
+    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+    target[0] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
+  /*
+   * Portable reference implementation. For each i in [0, num_points):
+   *   target[i] = |src0[0] - points[i]|^2
+   * Only the first element of src0 is read.
+   *
+   * The loop runs to num_points directly. The previous bound was
+   * (num_points*8) >> 3 computed in unsigned int, which wraps once
+   * num_points*8 exceeds UINT_MAX and then processes too few points.
+   */
+  lv_32fc_t diff;
+  float sq_dist;
+  unsigned int i;
+
+  for(i = 0; i < num_points; ++i) {
+    diff = src0[0] - points[i];
+
+    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+    target[i] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
diff --git a/volk/kernels/volk/volk_32i_s32f_convert_32f.h b/volk/kernels/volk/volk_32i_s32f_convert_32f.h
new file mode 100644
index 000000000..7a0988345
--- /dev/null
+++ b/volk/kernels/volk/volk_32i_s32f_convert_32f.h
@@ -0,0 +1,148 @@
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H
+#define INCLUDED_volk_32i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+  /*!
+    \brief Converts 32 bit integers to floats and divides each converted value by scalar (done as a multiply by 1/scalar)
+    \param outputVector The floating point output data buffer
+    \param inputVector The 32 bit integer input data buffer
+    \param scalar The value each output point is divided by
+    \param num_points The number of data values to be converted
+    \note Unaligned SSE2 loads and stores are used, so neither buffer needs to be 16-byte aligned
+  */
+static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  const float reciprocal = 1.0 / scalar;
+  const __m128 reciprocalVec = _mm_set_ps1(reciprocal);
+  /* Largest multiple of four that fits in num_points */
+  const unsigned int vectorizedCount = (num_points / 4) * 4;
+  unsigned int idx;
+
+  // Four values per iteration: load, int -> float, scale, store
+  for(idx = 0; idx < vectorizedCount; idx += 4){
+    const __m128i ints = _mm_loadu_si128((const __m128i*)(inputVector + idx));
+    const __m128 scaled = _mm_mul_ps(_mm_cvtepi32_ps(ints), reciprocalVec);
+    _mm_storeu_ps(outputVector + idx, scaled);
+  }
+
+  // Up to three trailing values are converted one at a time
+  for(idx = vectorizedCount; idx < num_points; ++idx){
+    outputVector[idx] = ((float)(inputVector[idx])) * reciprocal;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Converts 32 bit integers to floats and divides each converted value by scalar (done as a multiply by 1/scalar)
+    \param outputVector The floating point output data buffer
+    \param inputVector The 32 bit integer input data buffer
+    \param scalar The value each output point is divided by
+    \param num_points The number of data values to be converted
+    \note Plain scalar loop; no SIMD alignment is required of either buffer
+  */
+static inline void volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  const float reciprocal = 1.0 / scalar;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    outputVector[idx] = ((float)(inputVector[idx])) * reciprocal;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H
+#define INCLUDED_volk_32i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+  /*!
+    \brief Converts 32 bit integers to floats and divides each converted value by scalar (done as a multiply by 1/scalar)
+    \param outputVector The floating point output data buffer
+    \param inputVector The 32 bit integer input data buffer
+    \param scalar The value each output point is divided by
+    \param num_points The number of data values to be converted
+    \note Aligned SSE2 loads and stores are used, so both buffers must be 16-byte aligned
+  */
+static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  const float reciprocal = 1.0 / scalar;
+  const __m128 reciprocalVec = _mm_set_ps1(reciprocal);
+  /* Largest multiple of four that fits in num_points */
+  const unsigned int vectorizedCount = (num_points / 4) * 4;
+  unsigned int idx;
+
+  // Four values per iteration: load, int -> float, scale, store
+  for(idx = 0; idx < vectorizedCount; idx += 4){
+    const __m128i ints = _mm_load_si128((const __m128i*)(inputVector + idx));
+    const __m128 scaled = _mm_mul_ps(_mm_cvtepi32_ps(ints), reciprocalVec);
+    _mm_store_ps(outputVector + idx, scaled);
+  }
+
+  // Up to three trailing values are converted one at a time
+  for(idx = vectorizedCount; idx < num_points; ++idx){
+    outputVector[idx] = ((float)(inputVector[idx])) * reciprocal;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Converts 32 bit integers to floats and divides each converted value by scalar (done as a multiply by 1/scalar)
+    \param outputVector The floating point output data buffer
+    \param inputVector The 32 bit integer input data buffer
+    \param scalar The value each output point is divided by
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  const float reciprocal = 1.0 / scalar;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    outputVector[idx] = ((float)(inputVector[idx])) * reciprocal;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
diff --git a/volk/kernels/volk/volk_32i_x2_and_32i.h b/volk/kernels/volk/volk_32i_x2_and_32i.h
new file mode 100644
index 000000000..54ecb7981
--- /dev/null
+++ b/volk/kernels/volk/volk_32i_x2_and_32i.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32i_x2_and_32i_a_H
+#define INCLUDED_volk_32i_x2_and_32i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Bitwise-ANDs the two input vectors element by element and stores the results in cVector
+  \param cVector The vector where the results will be stored
+  \param aVector First operand vector
+  \param bVector Second operand vector
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+  \note Aligned SSE loads and stores are used, so all three buffers must be 16-byte aligned
+*/
+static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+  /* Largest multiple of four that fits in num_points */
+  const unsigned int vectorizedCount = (num_points / 4) * 4;
+  unsigned int idx;
+
+  // The integer data is moved through float registers; _mm_and_ps acts on the raw bits
+  for(idx = 0; idx < vectorizedCount; idx += 4){
+    const __m128 lhs = _mm_load_ps((const float*)(aVector + idx));
+    const __m128 rhs = _mm_load_ps((const float*)(bVector + idx));
+    _mm_store_ps((float*)(cVector + idx), _mm_and_ps(lhs, rhs));
+  }
+
+  // Up to three trailing values are handled one at a time
+  for(idx = vectorizedCount; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] & bVector[idx];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Bitwise-ANDs the two input vectors element by element and stores the results in cVector
+  \param cVector The vector where the results will be stored
+  \param aVector First operand vector
+  \param bVector Second operand vector
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32i_x2_and_32i_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] & bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Ands the two input vectors and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors
+ \param bVector One of the vectors
+ \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+ \note This wrapper only forwards its arguments unchanged to volk_32i_x2_and_32i_a_orc_impl, which is declared here and defined outside this header.
+ \note NOTE(review): the wrapper is named "_u_" while the implementation it calls is named "_a_" - confirm whether the ORC implementation has any alignment requirement.
+*/
+extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+ volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32i_x2_and_32i_a_H */
diff --git a/volk/kernels/volk/volk_32i_x2_or_32i.h b/volk/kernels/volk/volk_32i_x2_or_32i.h
new file mode 100644
index 000000000..acadd5a57
--- /dev/null
+++ b/volk/kernels/volk/volk_32i_x2_or_32i.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32i_x2_or_32i_a_H
+#define INCLUDED_volk_32i_x2_or_32i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Bitwise-ORs the two input vectors element by element and stores the results in cVector
+  \param cVector The vector where the results will be stored
+  \param aVector First operand vector
+  \param bVector Second operand vector
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+  \note Aligned SSE loads and stores are used, so all three buffers must be 16-byte aligned
+*/
+static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+  /* Largest multiple of four that fits in num_points */
+  const unsigned int vectorizedCount = (num_points / 4) * 4;
+  unsigned int idx;
+
+  // The integer data is moved through float registers; _mm_or_ps acts on the raw bits
+  for(idx = 0; idx < vectorizedCount; idx += 4){
+    const __m128 lhs = _mm_load_ps((const float*)(aVector + idx));
+    const __m128 rhs = _mm_load_ps((const float*)(bVector + idx));
+    _mm_store_ps((float*)(cVector + idx), _mm_or_ps(lhs, rhs));
+  }
+
+  // Up to three trailing values are handled one at a time
+  for(idx = vectorizedCount; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] | bVector[idx];
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Bitwise-ORs the two input vectors element by element and stores the results in cVector
+  \param cVector The vector where the results will be stored
+  \param aVector First operand vector
+  \param bVector Second operand vector
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+static inline void volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    cVector[idx] = aVector[idx] | bVector[idx];
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Ors the two input vectors and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be ored
+ \param bVector One of the vectors to be ored
+ \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+ \note This wrapper only forwards its arguments unchanged to volk_32i_x2_or_32i_a_orc_impl, which is declared here and defined outside this header.
+ \note NOTE(review): the wrapper is named "_u_" while the implementation it calls is named "_a_" - confirm whether the ORC implementation has any alignment requirement.
+*/
+extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+ volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32i_x2_or_32i_a_H */
diff --git a/volk/kernels/volk/volk_32u_byteswap.h b/volk/kernels/volk/volk_32u_byteswap.h
new file mode 100644
index 000000000..8f6e3ad7b
--- /dev/null
+++ b/volk/kernels/volk/volk_32u_byteswap.h
@@ -0,0 +1,154 @@
+#ifndef INCLUDED_volk_32u_byteswap_u_H
+#define INCLUDED_volk_32u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) a vector of uint32_t's; unaligned loads and stores are used, so no 16-byte alignment is required
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){
+  const __m128i maskByte2 = _mm_set1_epi32(0x00FF0000);
+  const __m128i maskByte1 = _mm_set1_epi32(0x0000FF00);
+  const unsigned int quarterPoints = num_points / 4;
+  uint32_t* cursor = intsToSwap;
+  unsigned int count;
+
+  for(count = 0; count < quarterPoints; count++){
+    // Four words at a time, rewritten in place
+    const __m128i words = _mm_loadu_si128((__m128i*)cursor);
+    // Outer bytes need no mask: the shifts by 24 leave only the wanted byte
+    __m128i swapped = _mm_or_si128(_mm_slli_epi32(words, 24), _mm_srli_epi32(words, 24));
+    // Inner bytes are shifted by 8 and masked into position
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_slli_epi32(words, 8), maskByte2));
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_srli_epi32(words, 8), maskByte1));
+    _mm_storeu_si128((__m128i*)cursor, swapped);
+    cursor += 4;
+  }
+
+  // Byteswap the (up to three) remaining points one by one
+  for(count = quarterPoints*4; count < num_points; count++){
+    const uint32_t word = *cursor;
+    *cursor++ = (((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) | ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) a vector of uint32_t's using plain shifts and masks
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    const uint32_t word = intsToSwap[idx];
+    intsToSwap[idx] = (((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) | ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32u_byteswap_u_H */
+#ifndef INCLUDED_volk_32u_byteswap_a_H
+#define INCLUDED_volk_32u_byteswap_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) a 16-byte aligned vector of uint32_t's
+  \param intsToSwap The vector of data to byte swap; aligned loads and stores are used on it
+  \param num_points The number of data points
+*/
+static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points){
+  const __m128i maskByte2 = _mm_set1_epi32(0x00FF0000);
+  const __m128i maskByte1 = _mm_set1_epi32(0x0000FF00);
+  const unsigned int quarterPoints = num_points / 4;
+  uint32_t* cursor = intsToSwap;
+  unsigned int count;
+
+  for(count = 0; count < quarterPoints; count++){
+    // Four words at a time, rewritten in place
+    const __m128i words = _mm_load_si128((__m128i*)cursor);
+    // Outer bytes need no mask: the shifts by 24 leave only the wanted byte
+    __m128i swapped = _mm_or_si128(_mm_slli_epi32(words, 24), _mm_srli_epi32(words, 24));
+    // Inner bytes are shifted by 8 and masked into position
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_slli_epi32(words, 8), maskByte2));
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_srli_epi32(words, 8), maskByte1));
+    _mm_store_si128((__m128i*)cursor, swapped);
+    cursor += 4;
+  }
+
+  // Byteswap the (up to three) remaining points one by one
+  for(count = quarterPoints*4; count < num_points; count++){
+    const uint32_t word = *cursor;
+    *cursor++ = (((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) | ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) a vector of uint32_t's using plain shifts and masks
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    const uint32_t word = intsToSwap[idx];
+    intsToSwap[idx] = (((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) | ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32u_byteswap_a_H */
diff --git a/volk/kernels/volk/volk_32u_popcnt.h b/volk/kernels/volk/volk_32u_popcnt.h
new file mode 100644
index 000000000..978356972
--- /dev/null
+++ b/volk/kernels/volk/volk_32u_popcnt.h
@@ -0,0 +1,36 @@
+#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H
+#define INCLUDED_VOLK_32u_POPCNT_A16_H
+
+#include <stdio.h>
+#include <inttypes.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) {
+  /*
+   * Counts the set bits of value and writes the count (0..32) to *ret.
+   * Adjacent fields are summed in parallel: 1-bit fields into 2-bit sums,
+   * then 4-bit, then per-byte sums, and finally the four byte sums are
+   * folded into the low byte.
+   */
+  uint32_t bits = value;
+
+  bits = (bits & 0x55555555) + ((bits >> 1) & 0x55555555);
+  bits = (bits & 0x33333333) + ((bits >> 2) & 0x33333333);
+  bits = (bits + (bits >> 4)) & 0x0F0F0F0F;
+  bits += bits >> 8;
+  bits += bits >> 16;
+
+  /* Only the low six bits hold the total; higher bits are partial sums */
+  *ret = bits & 0x0000003F;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE4_2
+
+#include <nmmintrin.h>
+
+/*
+ * Writes the number of set bits in value to *ret, using the
+ * _mm_popcnt_u32 intrinsic from <nmmintrin.h>.
+ */
+static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value) {
+ *ret = _mm_popcnt_u32(value);
+}
+
+#endif /*LV_HAVE_SSE4_2*/
+
+#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/
diff --git a/volk/kernels/volk/volk_64f_convert_32f.h b/volk/kernels/volk/volk_64f_convert_32f.h
new file mode 100644
index 000000000..c27526ffa
--- /dev/null
+++ b/volk/kernels/volk/volk_64f_convert_32f.h
@@ -0,0 +1,134 @@
+#ifndef INCLUDED_volk_64f_convert_32f_u_H
+#define INCLUDED_volk_64f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the double values into float values
+    \param outputVector The converted float vector values
+    \param inputVector The double vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+    \note Unaligned SSE2 loads and stores are used, so neither buffer needs to be 16-byte aligned
+  */
+static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+  /* Largest multiple of four that fits in num_points */
+  const unsigned int vectorizedCount = (num_points / 4) * 4;
+  unsigned int idx;
+
+  for(idx = 0; idx < vectorizedCount; idx += 4){
+    // Each conversion yields two floats in the low half of its register
+    const __m128 lowPair = _mm_cvtpd_ps(_mm_loadu_pd(inputVector + idx));
+    const __m128 highPair = _mm_cvtpd_ps(_mm_loadu_pd(inputVector + idx + 2));
+
+    // Pack the two low halves into one register of four floats
+    _mm_storeu_ps(outputVector + idx, _mm_movelh_ps(lowPair, highPair));
+  }
+
+  // Up to three trailing values are converted one at a time
+  for(idx = vectorizedCount; idx < num_points; ++idx){
+    outputVector[idx] = (float)(inputVector[idx]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the double values into float values
+  \param outputVector The converted float vector values
+  \param inputVector The double vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    outputVector[idx] = ((float)(inputVector[idx]));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_u_H */
+#ifndef INCLUDED_volk_64f_convert_32f_a_H
+#define INCLUDED_volk_64f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the double values into float values
+    \param outputVector The converted float vector values
+    \param inputVector The double vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+    \note Aligned SSE2 loads and stores are used, so both buffers must be 16-byte aligned
+  */
+static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+  /* Largest multiple of four that fits in num_points */
+  const unsigned int vectorizedCount = (num_points / 4) * 4;
+  unsigned int idx;
+
+  for(idx = 0; idx < vectorizedCount; idx += 4){
+    // Each conversion yields two floats in the low half of its register
+    const __m128 lowPair = _mm_cvtpd_ps(_mm_load_pd(inputVector + idx));
+    const __m128 highPair = _mm_cvtpd_ps(_mm_load_pd(inputVector + idx + 2));
+
+    // Pack the two low halves into one register of four floats
+    _mm_store_ps(outputVector + idx, _mm_movelh_ps(lowPair, highPair));
+  }
+
+  // Up to three trailing values are converted one at a time
+  for(idx = vectorizedCount; idx < num_points; ++idx){
+    outputVector[idx] = (float)(inputVector[idx]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the double values into float values
+  \param outputVector The converted float vector values
+  \param inputVector The double vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    outputVector[idx] = ((float)(inputVector[idx]));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_a_H */
diff --git a/volk/kernels/volk/volk_64f_x2_max_64f.h b/volk/kernels/volk/volk_64f_x2_max_64f.h
new file mode 100644
index 000000000..f9a04c2c4
--- /dev/null
+++ b/volk/kernels/volk/volk_64f_x2_max_64f.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_64f_x2_max_64f_a_H
+#define INCLUDED_volk_64f_x2_max_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Stores, for each index, the larger of the aVector and bVector entries into cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+  \note Aligned SSE2 loads and stores are used, so all three buffers must be 16-byte aligned
+*/
+static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+  /* Largest even count that fits in num_points */
+  const unsigned int vectorizedCount = (num_points / 2) * 2;
+  unsigned int idx;
+
+  // Two doubles per iteration
+  for(idx = 0; idx < vectorizedCount; idx += 2){
+    const __m128d lhs = _mm_load_pd(aVector + idx);
+    const __m128d rhs = _mm_load_pd(bVector + idx);
+    _mm_store_pd(cVector + idx, _mm_max_pd(lhs, rhs));
+  }
+
+  // At most one trailing value is left for the scalar path
+  for(idx = vectorizedCount; idx < num_points; ++idx){
+    const double a = aVector[idx];
+    const double b = bVector[idx];
+    if(a > b){
+      cVector[idx] = a;
+    }
+    else{
+      cVector[idx] = b;
+    }
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Stores, for each index, the larger of the aVector and bVector entries into cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_x2_max_64f_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    const double a = aVector[idx];
+    const double b = bVector[idx];
+    if(a > b){
+      cVector[idx] = a;
+    }
+    else{
+      cVector[idx] = b;
+    }
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_x2_max_64f_a_H */
diff --git a/volk/kernels/volk/volk_64f_x2_min_64f.h b/volk/kernels/volk/volk_64f_x2_min_64f.h
new file mode 100644
index 000000000..c77ca87fb
--- /dev/null
+++ b/volk/kernels/volk/volk_64f_x2_min_64f.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_64f_x2_min_64f_a_H
+#define INCLUDED_volk_64f_x2_min_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Stores, for each index, the smaller of the aVector and bVector entries into cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+  \note Aligned SSE2 loads and stores are used, so all three buffers must be 16-byte aligned
+*/
+static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+  /* Largest even count that fits in num_points */
+  const unsigned int vectorizedCount = (num_points / 2) * 2;
+  unsigned int idx;
+
+  // Two doubles per iteration
+  for(idx = 0; idx < vectorizedCount; idx += 2){
+    const __m128d lhs = _mm_load_pd(aVector + idx);
+    const __m128d rhs = _mm_load_pd(bVector + idx);
+    _mm_store_pd(cVector + idx, _mm_min_pd(lhs, rhs));
+  }
+
+  // At most one trailing value is left for the scalar path
+  for(idx = vectorizedCount; idx < num_points; ++idx){
+    const double a = aVector[idx];
+    const double b = bVector[idx];
+    if(a < b){
+      cVector[idx] = a;
+    }
+    else{
+      cVector[idx] = b;
+    }
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Stores, for each index, the smaller of the aVector and bVector entries into cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_x2_min_64f_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; ++idx){
+    const double a = aVector[idx];
+    const double b = bVector[idx];
+    if(a < b){
+      cVector[idx] = a;
+    }
+    else{
+      cVector[idx] = b;
+    }
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_x2_min_64f_a_H */
diff --git a/volk/kernels/volk/volk_64u_byteswap.h b/volk/kernels/volk/volk_64u_byteswap.h
new file mode 100644
index 000000000..e05daf6d5
--- /dev/null
+++ b/volk/kernels/volk/volk_64u_byteswap.h
@@ -0,0 +1,176 @@
+#ifndef INCLUDED_volk_64u_byteswap_u_H
+#define INCLUDED_volk_64u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) a vector of uint64_t's; unaligned loads/stores are used
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of 64-bit data points
+*/
+static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
+  uint32_t* wordPtr = (uint32_t*)intsToSwap;
+  const __m128i upperMidMask = _mm_set1_epi32(0x00FF0000);
+  const __m128i lowerMidMask = _mm_set1_epi32(0x0000FF00);
+  const unsigned int pairCount = num_points / 2;
+  unsigned int idx;
+
+  // Two 64-bit values (four 32-bit words) per iteration
+  for(idx = 0; idx < pairCount; idx++){
+    const __m128i raw = _mm_loadu_si128((__m128i*)wordPtr);
+    // Outer bytes of every 32-bit word trade places
+    __m128i swapped = _mm_or_si128(_mm_slli_epi32(raw, 24), _mm_srli_epi32(raw, 24));
+
+    // Inner bytes: shift by one byte and mask to the destination position
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_slli_epi32(raw, 8), upperMidMask));
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_srli_epi32(raw, 8), lowerMidMask));
+
+    // Exchange the two 32-bit words inside each 64-bit value
+    swapped = _mm_shuffle_epi32(swapped, _MM_SHUFFLE(2, 3, 0, 1));
+
+    _mm_storeu_si128((__m128i*)wordPtr, swapped);
+    wordPtr += 4;
+  }
+
+  // Scalar tail for an odd trailing value
+  for(idx = pairCount * 2; idx < num_points; idx++){
+    const uint32_t first = wordPtr[0];
+    const uint32_t second = wordPtr[1];
+    const uint32_t firstSwapped = (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) | ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
+    const uint32_t secondSwapped = (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) | ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
+
+    wordPtr[0] = secondSwapped;
+    wordPtr[1] = firstSwapped;
+    wordPtr += 2;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) a vector of uint64_t's using plain C
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of 64-bit data points
+*/
+static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){
+  uint32_t* wordPtr = (uint32_t*)intsToSwap;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    const uint32_t first = wordPtr[0];
+    const uint32_t second = wordPtr[1];
+    // Reverse the four bytes inside each 32-bit word
+    const uint32_t firstSwapped = (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) | ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
+    const uint32_t secondSwapped = (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) | ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
+
+    // Exchange the two words to finish the 64-bit reversal
+    wordPtr[0] = secondSwapped;
+    wordPtr[1] = firstSwapped;
+    wordPtr += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64u_byteswap_u_H */
+#ifndef INCLUDED_volk_64u_byteswap_a_H
+#define INCLUDED_volk_64u_byteswap_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) a vector of uint64_t's using aligned SSE2 loads/stores
+  \param intsToSwap The vector of data to byte swap (expected to be 16-byte aligned for _mm_load_si128/_mm_store_si128)
+  \param num_points The number of 64-bit data points
+*/
+static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){
+  uint32_t* wordPtr = (uint32_t*)intsToSwap;
+  const __m128i upperMidMask = _mm_set1_epi32(0x00FF0000);
+  const __m128i lowerMidMask = _mm_set1_epi32(0x0000FF00);
+  const unsigned int pairCount = num_points / 2;
+  unsigned int idx;
+
+  // Two 64-bit values (four 32-bit words) per iteration
+  for(idx = 0; idx < pairCount; idx++){
+    const __m128i raw = _mm_load_si128((__m128i*)wordPtr);
+    // Outer bytes of every 32-bit word trade places
+    __m128i swapped = _mm_or_si128(_mm_slli_epi32(raw, 24), _mm_srli_epi32(raw, 24));
+
+    // Inner bytes: shift by one byte and mask to the destination position
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_slli_epi32(raw, 8), upperMidMask));
+    swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_srli_epi32(raw, 8), lowerMidMask));
+
+    // Exchange the two 32-bit words inside each 64-bit value
+    swapped = _mm_shuffle_epi32(swapped, _MM_SHUFFLE(2, 3, 0, 1));
+
+    _mm_store_si128((__m128i*)wordPtr, swapped);
+    wordPtr += 4;
+  }
+
+  // Scalar tail for an odd trailing value
+  for(idx = pairCount * 2; idx < num_points; idx++){
+    const uint32_t first = wordPtr[0];
+    const uint32_t second = wordPtr[1];
+    const uint32_t firstSwapped = (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) | ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
+    const uint32_t secondSwapped = (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) | ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
+
+    wordPtr[0] = secondSwapped;
+    wordPtr[1] = firstSwapped;
+    wordPtr += 2;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) a vector of uint64_t's using plain C
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of 64-bit data points
+*/
+static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){
+  uint32_t* wordPtr = (uint32_t*)intsToSwap;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    const uint32_t first = wordPtr[0];
+    const uint32_t second = wordPtr[1];
+    // Reverse the four bytes inside each 32-bit word
+    const uint32_t firstSwapped = (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) | ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
+    const uint32_t secondSwapped = (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) | ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
+
+    // Exchange the two words to finish the 64-bit reversal
+    wordPtr[0] = secondSwapped;
+    wordPtr[1] = firstSwapped;
+    wordPtr += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64u_byteswap_a_H */
diff --git a/volk/kernels/volk/volk_64u_popcnt.h b/volk/kernels/volk/volk_64u_popcnt.h
new file mode 100644
index 000000000..466cfa5da
--- /dev/null
+++ b/volk/kernels/volk/volk_64u_popcnt.h
@@ -0,0 +1,52 @@
+#ifndef INCLUDED_volk_64u_popcnt_a_H
+#define INCLUDED_volk_64u_popcnt_a_H
+
+#include <stdio.h>
+#include <inttypes.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Counts the number of set bits in a 64-bit value
+  \param ret Receives the population count (0 through 64)
+  \param value The value whose set bits are counted
+*/
+static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) {
+  uint64_t total = 0;
+  unsigned int half;
+
+  // Count each 32-bit half with the parallel bit-summing reduction
+  // (this is faster than a lookup table) and add the two results.
+  for(half = 0; half < 2; half++){
+    // half 0 selects bits 0..31, half 1 selects bits 32..63. The upper half
+    // must be shifted down by a full 32 bits: shifting by 31 pushes bit 63
+    // out of the 32-bit window and it is never counted.
+    uint32_t bits = (uint32_t)((value >> (32 * half)) & 0xFFFFFFFFu);
+
+    bits = (bits & 0x55555555) + (bits >> 1 & 0x55555555);   // sums of bit pairs
+    bits = (bits & 0x33333333) + (bits >> 2 & 0x33333333);   // sums per nibble
+    bits = (bits + (bits >> 4)) & 0x0F0F0F0F;                // sums per byte
+    bits = (bits + (bits >> 8));
+    bits = (bits + (bits >> 16)) & 0x0000003F;               // total, at most 32
+
+    total += bits;
+  }
+
+  *ret = total;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#if LV_HAVE_SSE4_2 && LV_HAVE_64
+
+#include <nmmintrin.h>
+
+/*!
+  \brief Counts the number of set bits in a 64-bit value via _mm_popcnt_u64
+  \param ret Receives the population count
+  \param value The value whose set bits are counted
+*/
+static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) {
+  const uint64_t bitCount = _mm_popcnt_u64(value);
+  *ret = bitCount;
+}
+
+#endif /* LV_HAVE_SSE4_2 && LV_HAVE_64 */
+
+#endif /*INCLUDED_volk_64u_popcnt_a_H*/
diff --git a/volk/kernels/volk/volk_8i_convert_16i.h b/volk/kernels/volk/volk_8i_convert_16i.h
new file mode 100644
index 000000000..3e5c92723
--- /dev/null
+++ b/volk/kernels/volk/volk_8i_convert_16i.h
@@ -0,0 +1,156 @@
+#ifndef INCLUDED_volk_8i_convert_16i_u_H
+#define INCLUDED_volk_8i_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+   \brief Widens 8 bit integers to 16 bit integers, scaling each value by 256
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param num_points The number of data values to be converted
+   \note Unaligned loads/stores are used, so the buffers do NOT need to be aligned
+ */
+static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 16;
+  const __m128i* src = (const __m128i*)inputVector;
+  __m128i* dst = (__m128i*)outputVector;
+  unsigned int idx;
+
+  // Sixteen input bytes become two vectors of eight 16 bit values
+  for(idx = 0; idx < blockCount; idx++){
+    const __m128i bytes = _mm_loadu_si128(src);
+    // Lower eight bytes: sign-extend, then shift left by 8 (multiply by 256)
+    const __m128i lowerHalf = _mm_slli_epi16(_mm_cvtepi8_epi16(bytes), 8);
+    // Upper eight bytes are moved down first, then treated the same way
+    const __m128i upperHalf = _mm_slli_epi16(_mm_cvtepi8_epi16(_mm_srli_si128(bytes, 8)), 8);
+
+    _mm_storeu_si128(dst, lowerHalf);
+    _mm_storeu_si128(dst + 1, upperHalf);
+
+    dst += 2;
+    src += 1;
+  }
+
+  // Scalar tail for the remaining (num_points % 16) values
+  for(idx = blockCount * 16; idx < num_points; idx++){
+    outputVector[idx] = (int16_t)(inputVector[idx])*256;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Widens 8 bit integers to 16 bit integers, scaling each value by 256
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param num_points The number of data values to be converted
+   \note Input and output buffers do NOT need to be properly aligned
+ */
+static inline void volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = ((int16_t)(inputVector[idx])) * 256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8i_convert_16i_u_H */
+#ifndef INCLUDED_volk_8i_convert_16i_a_H
+#define INCLUDED_volk_8i_convert_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+   \brief Widens 8 bit integers to 16 bit integers, scaling each value by 256
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param num_points The number of data values to be converted
+   \note Aligned loads/stores (_mm_load_si128/_mm_store_si128) are used, so both buffers are expected to be 16-byte aligned
+ */
+static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 16;
+  const __m128i* src = (const __m128i*)inputVector;
+  __m128i* dst = (__m128i*)outputVector;
+  unsigned int idx;
+
+  // Sixteen input bytes become two vectors of eight 16 bit values
+  for(idx = 0; idx < blockCount; idx++){
+    const __m128i bytes = _mm_load_si128(src);
+    // Lower eight bytes: sign-extend, then shift left by 8 (multiply by 256)
+    const __m128i lowerHalf = _mm_slli_epi16(_mm_cvtepi8_epi16(bytes), 8);
+    // Upper eight bytes are moved down first, then treated the same way
+    const __m128i upperHalf = _mm_slli_epi16(_mm_cvtepi8_epi16(_mm_srli_si128(bytes, 8)), 8);
+
+    _mm_store_si128(dst, lowerHalf);
+    _mm_store_si128(dst + 1, upperHalf);
+
+    dst += 2;
+    src += 1;
+  }
+
+  // Scalar tail for the remaining (num_points % 16) values
+  for(idx = blockCount * 16; idx < num_points; idx++){
+    outputVector[idx] = (int16_t)(inputVector[idx])*256;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Widens 8 bit integers to 16 bit integers, scaling each value by 256
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param num_points The number of data values to be converted
+ */
+static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = ((int16_t)(inputVector[idx])) * 256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+ \brief Converts the input 8 bit integer data into 16 bit integer data by forwarding to the ORC implementation
+ \param outputVector The 16 bit output data buffer
+ \param inputVector The 8 bit input data buffer
+ \param num_points The number of data values to be converted
+ \note NOTE(review): this wrapper is named _u_orc but calls the _a_orc_impl symbol; the alignment requirements of that implementation are not visible here - confirm.
+ */
+extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points);
+static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+ volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_8i_convert_16i_a_H */
diff --git a/volk/kernels/volk/volk_8i_s32f_convert_32f.h b/volk/kernels/volk/volk_8i_s32f_convert_32f.h
new file mode 100644
index 000000000..bd7ff82d9
--- /dev/null
+++ b/volk/kernels/volk/volk_8i_s32f_convert_32f.h
@@ -0,0 +1,200 @@
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
+#define INCLUDED_volk_8i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+   \brief Converts 8 bit integers to floats and divides each result by scalar
+   \param outputVector The floating point output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param scalar The divisor; applied as a multiplication by (1.0 / scalar)
+   \param num_points The number of data values to be converted
+   \note Unaligned loads/stores are used, so the buffers do NOT need to be aligned
+ */
+static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  const unsigned int blockCount = num_points / 16;
+  const float iScalar = 1.0 / scalar;
+  const __m128 invScalar = _mm_set_ps1( iScalar );
+  const int8_t* src = inputVector;
+  float* dst = outputVector;
+  unsigned int idx;
+
+  // Sixteen input bytes per iteration, converted four at a time
+  for(idx = 0; idx < blockCount; idx++){
+    __m128i bytes = _mm_loadu_si128((__m128i*)src);
+    unsigned int quarter;
+
+    for(quarter = 0; quarter < 4; quarter++){
+      // The lowest four bytes are sign-extended to 32 bits, converted and scaled
+      const __m128 scaled = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(bytes)), invScalar);
+      _mm_storeu_ps(dst, scaled);
+      dst += 4;
+      // Bring the next four bytes down into the lowest positions
+      bytes = _mm_srli_si128(bytes, 4);
+    }
+
+    src += 16;
+  }
+
+  // Scalar tail for the remaining (num_points % 16) values
+  for(idx = blockCount * 16; idx < num_points; idx++){
+    outputVector[idx] = (float)(inputVector[idx]) * iScalar;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Converts 8 bit integers to floats and divides each result by scalar
+   \param outputVector The floating point output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param scalar The divisor; applied as a multiplication by (1.0 / scalar)
+   \param num_points The number of data values to be converted
+   \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  const float iScalar = 1.0 / scalar;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
+#define INCLUDED_volk_8i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+   \brief Converts 8 bit integers to floats and divides each result by scalar
+   \param outputVector The floating point output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param scalar The divisor; applied as a multiplication by (1.0 / scalar)
+   \param num_points The number of data values to be converted
+   \note Aligned loads/stores (_mm_load_si128/_mm_store_ps) are used, so both buffers are expected to be 16-byte aligned
+ */
+static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  const unsigned int blockCount = num_points / 16;
+  const float iScalar = 1.0 / scalar;
+  const __m128 invScalar = _mm_set_ps1(iScalar);
+  const int8_t* src = inputVector;
+  float* dst = outputVector;
+  unsigned int idx;
+
+  // Sixteen input bytes per iteration, converted four at a time
+  for(idx = 0; idx < blockCount; idx++){
+    __m128i bytes = _mm_load_si128((__m128i*)src);
+    unsigned int quarter;
+
+    for(quarter = 0; quarter < 4; quarter++){
+      // The lowest four bytes are sign-extended to 32 bits, converted and scaled
+      const __m128 scaled = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(bytes)), invScalar);
+      _mm_store_ps(dst, scaled);
+      dst += 4;
+      // Bring the next four bytes down into the lowest positions
+      bytes = _mm_srli_si128(bytes, 4);
+    }
+
+    src += 16;
+  }
+
+  // Scalar tail for the remaining (num_points % 16) values
+  for(idx = blockCount * 16; idx < num_points; idx++){
+    outputVector[idx] = (float)(inputVector[idx]) * iScalar;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Converts 8 bit integers to floats and divides each result by scalar
+   \param outputVector The floating point output data buffer
+   \param inputVector The 8 bit input data buffer
+   \param scalar The divisor; applied as a multiplication by (1.0 / scalar)
+   \param num_points The number of data values to be converted
+ */
+static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  const float iScalar = 1.0 / scalar;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = ((float)(inputVector[idx])) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+ \brief Converts the input 8 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+ \param outputVector The floating point output data buffer
+ \param inputVector The 8 bit input data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ \note The wrapper hands (1.0 / scalar) to the ORC implementation; presumably that implementation multiplies by it - confirm against the ORC source.
+ */
+extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
+static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+ float invscalar = 1.0 / scalar;
+ volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
new file mode 100644
index 000000000..b59d22d18
--- /dev/null
+++ b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
+#define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Splits a complex 8 bit vector into separate 16 bit I and Q vectors, scaling each value by 256
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param complexVector The complex input vector, read as interleaved I/Q bytes
+  \param num_points The number of complex data values to be deinterleaved
+  \note Aligned loads/stores are used, so all three buffers are expected to be 16-byte aligned
+*/
+static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* src = (int8_t*)complexVector;
+  int16_t* iOut = iBuffer;
+  int16_t* qOut = qBuffer;
+  // Shuffle masks: gather the even (I) or odd (Q) bytes into the lower eight
+  // positions; 0x80 entries zero the upper eight.
+  const __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  const __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+  const unsigned int blockCount = num_points / 8;
+  unsigned int idx;
+
+  // Eight complex samples (sixteen bytes) per iteration
+  for(idx = 0; idx < blockCount; idx++){
+    const __m128i packed = _mm_load_si128((__m128i*)src);
+    const __m128i iBytes = _mm_shuffle_epi8(packed, iMoveMask);
+    const __m128i qBytes = _mm_shuffle_epi8(packed, qMoveMask);
+    // Sign-extend to 16 bits, then shift left by 8 (multiply by 256)
+    const __m128i iWide = _mm_slli_epi16(_mm_cvtepi8_epi16(iBytes), 8);
+    const __m128i qWide = _mm_slli_epi16(_mm_cvtepi8_epi16(qBytes), 8);
+
+    _mm_store_si128((__m128i*)iOut, iWide);
+    _mm_store_si128((__m128i*)qOut, qWide);
+
+    src += 16;
+    iOut += 8;
+    qOut += 8;
+  }
+
+  // Scalar tail for the remaining (num_points % 8) samples
+  for(idx = blockCount * 8; idx < num_points; idx++){
+    *iOut++ = ((int16_t)src[0]) * 256;
+    *qOut++ = ((int16_t)src[1]) * 256;
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Splits a complex 8 bit vector into separate 16 bit I and Q vectors, scaling each value by 256
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param complexVector The complex input vector, read as interleaved I/Q bytes
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* src = (const int8_t*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    // Byte 0 of each pair is I, byte 1 is Q
+    iBuffer[idx] = (int16_t)(src[0])*256;
+    qBuffer[idx] = (int16_t)(src[1])*256;
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
new file mode 100644
index 000000000..82cedb2bb
--- /dev/null
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a_H
+#define INCLUDED_volk_8ic_deinterleave_real_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Extracts the I component of each complex 8 bit sample as a 16 bit value
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector, read as interleaved I/Q bytes
+  \param num_points The number of complex data values to be deinterleaved
+  \note Each output is the I byte multiplied by 128 (left shift by 7). Aligned loads/stores are used, so both buffers are expected to be 16-byte aligned.
+*/
+static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* src = (int8_t*)complexVector;
+  int16_t* iOut = iBuffer;
+  // Gather the even (I) bytes into the lower eight positions; 0x80 zeroes the rest
+  const __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  const unsigned int blockCount = num_points / 8;
+  unsigned int idx;
+
+  // Eight complex samples (sixteen bytes) per iteration
+  for(idx = 0; idx < blockCount; idx++){
+    const __m128i packed = _mm_load_si128((__m128i*)src);
+    const __m128i iBytes = _mm_shuffle_epi8(packed, moveMask);
+    const __m128i iWide = _mm_slli_epi16(_mm_cvtepi8_epi16(iBytes), 7);
+
+    _mm_store_si128((__m128i*)iOut, iWide);
+
+    src += 16;
+    iOut += 8;
+  }
+
+  // Scalar tail: take the I byte, skip the Q byte
+  for(idx = blockCount * 8; idx < num_points; idx++){
+    *iOut++ = ((int16_t)*src) * 128;
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Extracts the I component of each complex 8 bit sample as a 16 bit value
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector, read as interleaved I/Q bytes
+  \param num_points The number of complex data values to be deinterleaved
+  \note Each output is the I byte multiplied by 128
+*/
+static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* src = (const int8_t*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    iBuffer[idx] = ((int16_t)(*src)) * 128;
+    // Step over both the I byte just read and the unused Q byte
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a_H */
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
new file mode 100644
index 000000000..c8ff18e67
--- /dev/null
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Extracts the I component of each complex 8 bit sample
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector, read as interleaved I/Q bytes
+  \param num_points The number of complex data values to be deinterleaved
+  \note Aligned loads/stores are used, so both buffers are expected to be 16-byte aligned
+*/
+static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* src = (int8_t*)complexVector;
+  int8_t* iOut = iBuffer;
+  // lowerMask packs the even bytes into the lower eight positions, upperMask
+  // into the upper eight; 0x80 entries produce zero so the two can be OR-ed.
+  const __m128i lowerMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  const __m128i upperMask = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  const unsigned int blockCount = num_points / 16;
+  unsigned int idx;
+
+  // Sixteen complex samples (thirty-two bytes) per iteration
+  for(idx = 0; idx < blockCount; idx++){
+    const __m128i firstHalf = _mm_shuffle_epi8(_mm_load_si128((__m128i*)src), lowerMask);
+    const __m128i secondHalf = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(src + 16)), upperMask);
+
+    _mm_store_si128((__m128i*)iOut, _mm_or_si128(firstHalf, secondHalf));
+
+    src += 32;
+    iOut += 16;
+  }
+
+  // Scalar tail: copy the I byte, skip the Q byte
+  for(idx = blockCount * 16; idx < num_points; idx++){
+    *iOut++ = *src;
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Extracts the I component of each complex 8 bit sample
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector, read as interleaved I/Q bytes
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* src = (int8_t*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    iBuffer[idx] = *src;
+    // Step over both the I byte just copied and the unused Q byte
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
diff --git a/volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h b/volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
new file mode 100644
index 000000000..9e244c8fc
--- /dev/null
+++ b/volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
@@ -0,0 +1,165 @@
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Splits a complex 8 bit vector into separate floating point I and Q vectors, dividing each value by scalar
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param complexVector The complex input vector, read as interleaved I/Q bytes
+  \param scalar The divisor; applied as a multiplication by (1.0 / scalar)
+  \param num_points The number of complex data values to be deinterleaved
+  \note Aligned loads/stores are used, so all three buffers are expected to be 16-byte aligned
+*/
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iOut = iBuffer;
+  float* qOut = qBuffer;
+  int8_t* src = (int8_t*)complexVector;
+  const float iScalar= 1.0 / scalar;
+  const __m128 invScalar = _mm_set_ps1(iScalar);
+  // Gather the even (I) or odd (Q) bytes into the lower eight positions
+  const __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  const __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+  const unsigned int blockCount = num_points / 8;
+  unsigned int idx;
+
+  // Eight complex samples (sixteen bytes) per iteration
+  for(idx = 0; idx < blockCount; idx++){
+    const __m128i packed = _mm_load_si128((__m128i*)src);
+    __m128i iBytes = _mm_shuffle_epi8(packed, iMoveMask);
+    __m128i qBytes = _mm_shuffle_epi8(packed, qMoveMask);
+
+    // I values: first four, then the next four after a 4-byte shift
+    _mm_store_ps(iOut, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(iBytes)), invScalar));
+    iBytes = _mm_srli_si128(iBytes, 4);
+    _mm_store_ps(iOut + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(iBytes)), invScalar));
+
+    // Q values, handled the same way
+    _mm_store_ps(qOut, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(qBytes)), invScalar));
+    qBytes = _mm_srli_si128(qBytes, 4);
+    _mm_store_ps(qOut + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi8_epi32(qBytes)), invScalar));
+
+    src += 16;
+    iOut += 8;
+    qOut += 8;
+  }
+
+  // Scalar tail for the remaining (num_points % 8) samples
+  for(idx = blockCount * 8; idx < num_points; idx++){
+    *iOut++ = (float)(src[0]) * iScalar;
+    *qOut++ = (float)(src[1]) * iScalar;
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Splits a complex 8 bit vector into separate floating point I and Q vectors, dividing each value by scalar
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param complexVector The complex input vector, read as interleaved I/Q bytes
+  \param scalar The divisor; the SIMD loop multiplies by (1.0 / scalar), the scalar tail divides directly
+  \param num_points The number of complex data values to be deinterleaved
+  \note The outputs are written with aligned stores, so iBuffer and qBuffer are expected to be 16-byte aligned
+*/
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iOut = iBuffer;
+  float* qOut = qBuffer;
+  int8_t* src = (int8_t*)complexVector;
+  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
+  const unsigned int blockCount = num_points / 4;
+  unsigned int idx;
+  unsigned int k;
+
+  // Staging area: plain SSE has no byte-to-float conversion, so the bytes are
+  // converted one by one in C and then loaded as two float vectors.
+  __VOLK_ATTR_ALIGNED(16) float staged[8];
+
+  // Four complex samples (eight bytes) per iteration
+  for(idx = 0; idx < blockCount; idx++){
+    __m128 firstPair, secondPair;
+
+    for(k = 0; k < 8; k++){
+      staged[k] = (float)(src[k]);
+    }
+    src += 8;
+
+    firstPair = _mm_mul_ps(_mm_load_ps(&staged[0]), invScalar);
+    secondPair = _mm_mul_ps(_mm_load_ps(&staged[4]), invScalar);
+
+    // Even lanes are I, odd lanes are Q
+    _mm_store_ps(iOut, _mm_shuffle_ps(firstPair, secondPair, _MM_SHUFFLE(2,0,2,0)));
+    _mm_store_ps(qOut, _mm_shuffle_ps(firstPair, secondPair, _MM_SHUFFLE(3,1,3,1)));
+
+    iOut += 4;
+    qOut += 4;
+  }
+
+  // Scalar tail for the remaining (num_points % 4) samples
+  idx = blockCount * 4;
+  src = (int8_t*)&complexVector[idx];
+  for(; idx < num_points; idx++){
+    *iOut++ = (float)(*src++) / scalar;
+    *qOut++ = (float)(*src++) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Splits a complex 8 bit vector into separate floating point I and Q vectors, dividing each value by scalar
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param complexVector The complex input vector, read as interleaved I/Q bytes
+  \param scalar The divisor; applied as a multiplication by (1.0 / scalar)
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int8_t* src = (const int8_t*)complexVector;
+  const float invScalar = 1.0 / scalar;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    // Byte 0 of each pair is I, byte 1 is Q
+    iBuffer[idx] = (float)(src[0])*invScalar;
+    qBuffer[idx] = (float)(src[1])*invScalar;
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */
diff --git a/volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h b/volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
new file mode 100644
index 000000000..56a1adcbb
--- /dev/null
+++ b/volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
@@ -0,0 +1,134 @@
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Deinterleaves the complex 8 bit vector into I float vector data (SSE4.1 implementation)
+ \param iBuffer The I buffer output data
+ \param complexVector The complex input vector
+ \param scalar The scaling value; each I sample is multiplied by 1.0/scalar
+ \param num_points The number of complex data values to be deinterleaved
+ \note The vector loop uses the aligned load/store intrinsics _mm_load_si128 and _mm_store_ps on complexVector and iBuffer
+*/
+static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8; // the vector loop handles 8 complex values per iteration
+ __m128 iFloatValue;
+
+ const float iScalar= 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // byte shuffle control: gathers the even (I) bytes into the low 8 bytes; 0x80 entries zero the high 8 bytes
+
+ for(;number < eighthPoints; number++){
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; // 16 bytes = 8 (I,Q) pairs
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask); // low 8 bytes now hold I0..I7
+
+ iIntVal = _mm_cvtepi8_epi32(complexVal); // sign-extend I0..I3 to 32 bit integers
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 4;
+
+ complexVal = _mm_srli_si128(complexVal, 4); // shift right by 4 bytes so I4..I7 become the lowest bytes
+ iIntVal = _mm_cvtepi8_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = eighthPoints * 8; // scalar tail: the remaining num_points % 8 values
+ for(; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ complexVectorPtr++; // skip the Q byte
+ }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Extracts the I component of each complex 8 bit value as a scaled float (SSE implementation)
+  \param iBuffer The I buffer output data; written with the aligned store _mm_store_ps
+  \param complexVector The complex input vector, read as interleaved 8 bit I,Q bytes
+  \param scalar The scaling value; each I sample is multiplied by 1.0/scalar
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  const float reciprocal = 1.0 / scalar;
+  const __m128 reciprocalVec = _mm_set_ps1(reciprocal);
+  const unsigned int vectorLoops = num_points / 4;
+  const int8_t* in = (const int8_t*)complexVector;
+  float* out = iBuffer;
+  unsigned int n;
+
+  // Staging area: four I samples are converted to float here before the vector multiply
+  __VOLK_ATTR_ALIGNED(16) float staging[4];
+
+  for(n = 0; n < vectorLoops; n++){
+    // Pick the I byte of four consecutive (I,Q) pairs
+    staging[0] = (float)in[0];
+    staging[1] = (float)in[2];
+    staging[2] = (float)in[4];
+    staging[3] = (float)in[6];
+    in += 8;
+
+    _mm_store_ps(out, _mm_mul_ps(_mm_load_ps(staging), reciprocalVec));
+    out += 4;
+  }
+
+  // Scalar tail for the remaining num_points % 4 values
+  for(n = vectorLoops * 4; n < num_points; n++){
+    *out++ = (float)(*in) * reciprocal;
+    in += 2;
+  }
+
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Extracts the I component of each complex 8 bit value and writes it as a scaled float
+  \param iBuffer The I buffer output data (one float per complex input value)
+  \param complexVector The complex input vector, read as interleaved 8 bit I,Q bytes
+  \param scalar The scaling value; each I sample is multiplied by 1.0/scalar
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  const float reciprocal = 1.0 / scalar;
+  const int8_t* pair = (const int8_t*)complexVector;
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++, pair += 2){
+    // pair[0] is the I byte; the Q byte pair[1] is not used
+    iBuffer[idx] = ((float)(pair[0])) * reciprocal;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */
diff --git a/volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h b/volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
new file mode 100644
index 000000000..685a21ddc
--- /dev/null
+++ b/volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
@@ -0,0 +1,101 @@
+#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
+#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Multiplies one complex vector with the complex conjugate of the second complex vector and stores the results in the third vector (SSE4.1 implementation)
+ \param cVector The complex vector where the results will be stored
+ \param aVector One of the complex vectors to be multiplied
+ \param bVector The complex vector which will be converted to complex conjugate and multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ \note The vector loop writes cVector with the aligned store _mm_store_si128; results are narrowed to 16 bit with signed saturation
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4; // the vector loop handles 4 complex values per iteration
+
+ __m128i x, y, realz, imagz;
+ lv_16sc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); // lanes from low to high are +1,-1,+1,-1,...: the odd (imaginary) lanes get negated
+
+ for(;number < quarterPoints; number++){
+ // Load 4 complex 8 bit values (8 bytes) from each input and sign-extend them to 16 bit lanes
+ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+
+ // Real part of a*conj(b): ar*br + ai*bi, one 32 bit sum per complex value
+ realz = _mm_madd_epi16(x,y);
+
+ // Negate the imaginary lanes of b, giving conj(b) = br - j*bi
+ y = _mm_sign_epi16(y, conjugateSign);
+
+ // Swap each (real, imag) pair of conj(b) so the lanes hold (-bi, br)
+ y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
+
+ // Imaginary part of a*conj(b): ar*(-bi) + ai*br
+ imagz = _mm_madd_epi16(x,y);
+
+ _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); // interleave to r0,i0,r1,i1,... and narrow to 16 bit with signed saturation
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 values
+ int16_t* c16Ptr = (int16_t*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for(; number < num_points; number++){
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag );
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake( bReal, -bImag ); // conjugate of b
+ lv_32fc_t temp = aVal * bVal;
+
+ *c16Ptr++ = (int16_t)lv_creal(temp);
+ *c16Ptr++ = (int16_t)lv_cimag(temp);
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes a[n] * conj(b[n]) for two complex 8 bit vectors and stores each product as a complex 16 bit value
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+  const int8_t* aBytes = (const int8_t*)aVector;
+  const int8_t* bBytes = (const int8_t*)bVector;
+  int16_t* out = (int16_t*)cVector;
+  unsigned int idx;
+  // Every vector is walked as interleaved (real, imag) components, two per complex value
+  for(idx = 0; idx < num_points; idx++, aBytes += 2, bBytes += 2, out += 2){
+    const float aReal = (float)aBytes[0];
+    const float aImag = (float)aBytes[1];
+    const float bReal = (float)bBytes[0];
+    const float bImag = (float)bBytes[1];
+    const lv_32fc_t aVal = lv_cmake(aReal, aImag );
+    const lv_32fc_t bConj = lv_cmake( bReal, -bImag );
+    const lv_32fc_t product = aVal * bConj;
+
+    out[0] = (int16_t)lv_creal(product);
+    out[1] = (int16_t)lv_cimag(product);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H */
diff --git a/volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h b/volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
new file mode 100644
index 000000000..edb52ff50
--- /dev/null
+++ b/volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
+#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Multiplies one complex vector with the complex conjugate of the second complex vector, scales the products by 1.0/scalar and stores the results in the third vector (SSE4.1 implementation)
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param scalar The normalization value; every product component is multiplied by 1.0/scalar
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  \note The vector loop writes cVector with the aligned store _mm_store_ps
+*/
+static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4; // the vector loop handles 4 complex values per iteration
+
+  __m128i x, y, realz, imagz;
+  __m128 ret;
+  lv_32fc_t* c = cVector;
+  const lv_8sc_t* a = aVector;
+  const lv_8sc_t* b = bVector;
+  // Lanes from low to high are +1,-1,+1,-1,...: the odd (imaginary) lanes get negated
+  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+
+  // One float reciprocal shared by the vector loop and the scalar tail, so every
+  // output is rounded the same way (and the same way as the generic kernel)
+  const float iScalar = 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+
+  for(;number < quarterPoints; number++){
+    // Load 4 complex 8 bit values (8 bytes) from each input and sign-extend them to 16 bit lanes
+    x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+    y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+
+    // Real part of a*conj(b): ar*br + ai*bi, one 32 bit sum per complex value
+    realz = _mm_madd_epi16(x,y);
+
+    // Negate the imaginary lanes of b, giving conj(b) = br - j*bi
+    y = _mm_sign_epi16(y, conjugateSign);
+
+    // Swap each (real, imag) pair of conj(b) so the lanes hold (-bi, br)
+    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
+
+    // Imaginary part of a*conj(b): ar*(-bi) + ai*br
+    imagz = _mm_madd_epi16(x,y);
+
+    // Interleave real and imaginary of the first two products and convert to float
+    ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
+
+    // Normalize the floating point values
+    ret = _mm_mul_ps(ret, invScalar);
+
+    // Store the floating point values
+    _mm_store_ps((float*)c, ret);
+    c += 2;
+
+    // Interleave real and imaginary of the last two products and convert to float
+    ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
+
+    // Normalize the floating point values
+    ret = _mm_mul_ps(ret, invScalar);
+
+    // Store the floating point values
+    _mm_store_ps((float*)c, ret);
+    c += 2;
+
+    a += 4;
+    b += 4;
+  }
+
+  // Scalar tail for the remaining num_points % 4 values
+  number = quarterPoints * 4;
+  float* cFloatPtr = (float*)&cVector[number];
+  int8_t* a8Ptr = (int8_t*)&aVector[number];
+  int8_t* b8Ptr = (int8_t*)&bVector[number];
+  for(; number < num_points; number++){
+    float aReal = (float)*a8Ptr++;
+    float aImag = (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_cmake(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+
+    // Multiply by the reciprocal (not divide) to match the vector loop above
+    *cFloatPtr++ = lv_creal(temp) * iScalar;
+    *cFloatPtr++ = lv_cimag(temp) * iScalar;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes a[n] * conj(b[n]) for two complex 8 bit vectors, scales each product by 1.0/scalar and stores it as a complex float
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param scalar The normalization value; every product component is multiplied by 1.0/scalar
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+  const float reciprocal = 1.0 / scalar;
+  const int8_t* aBytes = (const int8_t*)aVector;
+  const int8_t* bBytes = (const int8_t*)bVector;
+  float* out = (float*)cVector;
+  unsigned int idx;
+  // Every vector is walked as interleaved (real, imag) components, two per complex value
+  for(idx = 0; idx < num_points; idx++, aBytes += 2, bBytes += 2, out += 2){
+    const float aReal = (float)aBytes[0];
+    const float aImag = (float)aBytes[1];
+    const float bReal = (float)bBytes[0];
+    const float bImag = (float)bBytes[1];
+    const lv_32fc_t aVal = lv_cmake(aReal, aImag );
+    const lv_32fc_t bConj = lv_cmake( bReal, -bImag );
+    const lv_32fc_t product = aVal * bConj;
+
+    out[0] = (lv_creal(product) * reciprocal);
+    out[1] = (lv_cimag(product) * reciprocal);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt
new file mode 100644
index 000000000..68fadc35b
--- /dev/null
+++ b/volk/lib/CMakeLists.txt
@@ -0,0 +1,352 @@
+#
+# Copyright 2011-2012 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+########################################################################
+# header file detection
+########################################################################
+include(CheckIncludeFile)
+CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H)
+if(HAVE_CPUID_H)
+ add_definitions(-DHAVE_CPUID_H)
+endif()
+
+CHECK_INCLUDE_FILE(intrin.h HAVE_INTRIN_H)
+if(HAVE_INTRIN_H)
+ add_definitions(-DHAVE_INTRIN_H)
+endif()
+
+CHECK_INCLUDE_FILE(fenv.h HAVE_FENV_H)
+if(HAVE_FENV_H)
+ add_definitions(-DHAVE_FENV_H)
+endif()
+
+CHECK_INCLUDE_FILE(dlfcn.h HAVE_DLFCN_H)
+if(HAVE_DLFCN_H)
+ add_definitions(-DHAVE_DLFCN_H)
+ list(APPEND volk_libraries ${CMAKE_DL_LIBS})
+endif()
+
+########################################################################
+# Setup the compiler name
+########################################################################
+set(COMPILER_NAME ${CMAKE_C_COMPILER_ID})
+if(MSVC) #its not set otherwise
+ set(COMPILER_NAME MSVC)
+endif()
+
+message(STATUS "Compiler name: ${COMPILER_NAME}")
+
+if(NOT DEFINED COMPILER_NAME)
+ message(FATAL_ERROR "COMPILER_NAME undefined. Volk build may not support this compiler.")
+endif()
+
+########################################################################
+# Special clang flag so flag checks can fail
+########################################################################
+if(COMPILER_NAME MATCHES "GNU")
+ include(CheckCXXCompilerFlag)
+ CHECK_CXX_COMPILER_FLAG("-Werror=unused-command-line-argument" HAVE_WERROR_UNUSED_CMD_LINE_ARG)
+ if(HAVE_WERROR_UNUSED_CMD_LINE_ARG)
+ set(VOLK_FLAG_CHECK_FLAGS "-Werror=unused-command-line-argument")
+ endif()
+endif()
+
+########################################################################
+# detect x86 flavor of CPU
+########################################################################
+#Test the variable by name: an unquoted ${CMAKE_SYSTEM_PROCESSOR} expansion leaves
+#if() without a left operand (a configure error) when the variable is empty.
+#AMD64 (upper case) is accepted as well, since that is the spelling reported on Windows.
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86|x86_64|amd64|AMD64)$")
+    message(STATUS "x86* CPU detected")
+    set(CPU_IS_x86 TRUE)
+endif()
+
+########################################################################
+# determine passing architectures based on compile flag tests
+########################################################################
+execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "arch_flags" --compiler "${COMPILER_NAME}"
+ OUTPUT_VARIABLE arch_flag_lines OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+macro(check_arch arch_name) #usage: check_arch(<arch> [<flag>...]); appends <arch> to available_archs when every flag passes the compiler check
+ set(flags ${ARGN}) #all arguments after arch_name are the compiler flags to probe
+ set(have_${arch_name} TRUE) #stays TRUE unless one of the flag checks below fails
+ foreach(flag ${flags})
+ include(CheckCXXCompilerFlag)
+ set(have_flag have${flag}) #name of the result variable for this flag's check
+ execute_process( #make the have_flag have nice alphanum chars (just for looks/not necessary)
+ COMMAND ${PYTHON_EXECUTABLE} -c "import re; print(re.sub('\\W', '_', '${have_flag}'))"
+ OUTPUT_VARIABLE have_flag OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ if(VOLK_FLAG_CHECK_FLAGS) #extra flags for the check itself, set earlier when -Werror=unused-command-line-argument is accepted
+ set(CMAKE_REQUIRED_FLAGS ${VOLK_FLAG_CHECK_FLAGS})
+ endif()
+ CHECK_CXX_COMPILER_FLAG(${flag} ${have_flag})
+ unset(CMAKE_REQUIRED_FLAGS)
+ if (NOT ${have_flag}) #one rejected flag disqualifies the whole arch
+ set(have_${arch_name} FALSE)
+ endif()
+ endforeach(flag)
+ if (have_${arch_name})
+ list(APPEND available_archs ${arch_name})
+ endif()
+endmacro(check_arch)
+
+foreach(line ${arch_flag_lines})
+ string(REGEX REPLACE "," ";" arch_flags ${line})
+ check_arch(${arch_flags})
+endforeach(line)
+
+macro(OVERRULE_ARCH arch reason) #usage: OVERRULE_ARCH(<arch> "<reason>"); logs the reason and drops <arch> from available_archs
+ message(STATUS "${reason}, Overruled arch ${arch}")
+ list(REMOVE_ITEM available_archs ${arch}) #a name that is not in the list is simply left alone by REMOVE_ITEM
+endmacro(OVERRULE_ARCH)
+
+########################################################################
+# eliminate AVX on GCC < 4.4
+# even though it accepts -mavx, as won't assemble xgetbv, which we need
+########################################################################
+if(CPU_IS_x86 AND COMPILER_NAME MATCHES "GNU")
+ execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
+ OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(GCC_VERSION VERSION_LESS "4.4")
+ OVERRULE_ARCH(avx "GCC missing xgetbv")
+ endif()
+endif()
+
+########################################################################
+# implement overruling in the ORC case,
+# since ORC always passes flag detection
+########################################################################
+if(NOT ORC_FOUND)
+ OVERRULE_ARCH(orc "ORC support not found")
+endif()
+
+########################################################################
+# implement overruling in the non-multilib case
+# this makes things work when both -m32 and -m64 pass
+########################################################################
+if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
+    include(CheckTypeSize)
+    #size in bytes of an array of 8 pointers, i.e. numerically the pointer width in bits
+    check_type_size("void*[8]" SIZEOF_CPU BUILTIN_TYPES_ONLY)
+
+    #Test SIZEOF_CPU by name: it is left empty when the type-size check fails, and an
+    #unquoted ${SIZEOF_CPU} expansion would then leave if() without a left operand.
+    if(SIZEOF_CPU EQUAL 64)
+        OVERRULE_ARCH(32 "CPU width is 64 bits")
+    endif()
+    if(SIZEOF_CPU EQUAL 32)
+        OVERRULE_ARCH(64 "CPU width is 32 bits")
+    endif()
+
+    #MSVC 64 bit does not have MMX, overrule it
+    if(SIZEOF_CPU EQUAL 64 AND MSVC)
+        OVERRULE_ARCH(mmx "No MMX for Win64")
+    endif()
+
+endif()
+
+########################################################################
+# done overrules! print the result
+########################################################################
+message(STATUS "Available architectures: ${available_archs}")
+
+########################################################################
+# determine available machines given the available architectures
+########################################################################
+execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "machines" --archs "${available_archs}"
+ OUTPUT_VARIABLE available_machines OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+########################################################################
+# Implement machine overruling for redundant machines:
+# A machine is redundant when expansion rules occur,
+# and the arch superset passes configuration checks.
+# When this occurs, eliminate the redundant machines
+# to avoid unnecessary compilation of subset machines.
+########################################################################
+foreach(arch mmx orc 64 32)
+    foreach(machine_name ${available_machines})
+        #strip the "_<arch>" suffix; the name only changes when this machine carries the arch
+        string(REPLACE "_${arch}" "" machine_name_no_arch ${machine_name})
+        if(NOT ${machine_name} STREQUAL ${machine_name_no_arch})
+            #the machine with the arch is available, so the one without it is redundant
+            list(REMOVE_ITEM available_machines ${machine_name_no_arch})
+        endif()
+    endforeach(machine_name)
+endforeach(arch)
+
+########################################################################
+# done overrules! print the result
+########################################################################
+message(STATUS "Available machines: ${available_machines}")
+
+########################################################################
+# Create rules to run the volk generator
+########################################################################
+
+#dependencies are all python, xml, and header implementation files
+file(GLOB xml_files ${CMAKE_SOURCE_DIR}/gen/*.xml)
+file(GLOB py_files ${CMAKE_SOURCE_DIR}/gen/*.py)
+file(GLOB h_files ${CMAKE_SOURCE_DIR}/kernels/volk/*.h)
+
+macro(gen_template tmpl output) #usage: gen_template(<template> <output> [extra args...]); adds a rule that generates <output> from <template>
+ list(APPEND volk_gen_sources ${output}) #generated files are collected here and later appended to volk_sources
+ add_custom_command(
+ OUTPUT ${output}
+ DEPENDS ${xml_files} ${py_files} ${h_files} ${tmpl}
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${CMAKE_SOURCE_DIR}/gen/volk_tmpl_utils.py
+ --input ${tmpl} --output ${output} ${ARGN}
+ )
+endmacro(gen_template)
+
+make_directory(${CMAKE_BINARY_DIR}/include/volk)
+
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk.c)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_typedefs.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_typedefs.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_cpu.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_cpu.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_cpu.c)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_config_fixed.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machines.tmpl.h ${CMAKE_BINARY_DIR}/lib/volk_machines.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machines.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_machines.c)
+
+foreach(machine_name ${available_machines})
+ #generate machine source
+ set(machine_source ${CMAKE_CURRENT_BINARY_DIR}/volk_machine_${machine_name}.c)
+ gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machine_xxx.tmpl.c ${machine_source} ${machine_name})
+
+ #determine machine flags
+ execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "machine_flags" --machine "${machine_name}" --compiler "${COMPILER_NAME}"
+ OUTPUT_VARIABLE ${machine_name}_flags OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ if(${machine_name}_flags)
+ set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${${machine_name}_flags}")
+ endif()
+
+ #add to available machine defs
+ string(TOUPPER LV_MACHINE_${machine_name} machine_def)
+ list(APPEND machine_defs ${machine_def})
+endforeach(machine_name)
+
+########################################################################
+# Set local include directories first
+########################################################################
+include_directories(
+ ${CMAKE_BINARY_DIR}/include
+ ${CMAKE_SOURCE_DIR}/include
+ ${CMAKE_SOURCE_DIR}/kernels
+ ${CMAKE_CURRENT_BINARY_DIR}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+########################################################################
+# Handle orc support
+########################################################################
+if(ORC_FOUND)
+ #setup orc library usage
+ include_directories(${ORC_INCLUDE_DIRS})
+ link_directories(${ORC_LIBRARY_DIRS})
+ list(APPEND volk_libraries ${ORC_LIBRARIES})
+
+ #setup orc functions
+ file(GLOB orc_files ${CMAKE_SOURCE_DIR}/orc/*.orc)
+ foreach(orc_file ${orc_files})
+
+ #extract the name for the generated c source from the orc file
+ get_filename_component(orc_file_name_we ${orc_file} NAME_WE)
+ set(orcc_gen ${CMAKE_CURRENT_BINARY_DIR}/${orc_file_name_we}.c)
+
+ #create a rule to generate the source and add to the list of sources
+ add_custom_command(
+ COMMAND ${ORCC_EXECUTABLE} --include math.h --implementation -o ${orcc_gen} ${orc_file}
+ DEPENDS ${orc_file} OUTPUT ${orcc_gen}
+ )
+ list(APPEND volk_sources ${orcc_gen})
+
+ endforeach(orc_file)
+else()
+ message(STATUS "Did not find liborc and orcc, disabling orc support...")
+endif()
+
+########################################################################
+# Setup the volk sources list and library
+########################################################################
+if(NOT WIN32)
+ add_definitions(-fvisibility=hidden)
+endif()
+
+list(APPEND volk_sources
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk_prefs.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk_rank_archs.c
+ ${volk_gen_sources}
+)
+
+#set the machine definitions where applicable
+set_source_files_properties(
+ ${CMAKE_CURRENT_BINARY_DIR}/volk.c
+ ${CMAKE_CURRENT_BINARY_DIR}/volk_machines.c
+PROPERTIES COMPILE_DEFINITIONS "${machine_defs}")
+
+if(MSVC)
+ #add compatibility includes for stdint types
+ include_directories(${CMAKE_SOURCE_DIR}/cmake/msvc)
+ add_definitions(-DHAVE_CONFIG_H)
+ #compile the sources as C++ due to the lack of complex.h under MSVC
+ set_source_files_properties(${volk_sources} PROPERTIES LANGUAGE CXX)
+endif()
+
+#create the volk runtime library
+add_library(volk SHARED ${volk_sources})
+target_link_libraries(volk ${volk_libraries})
+set_target_properties(volk PROPERTIES SOVERSION ${LIBVER})
+set_target_properties(volk PROPERTIES DEFINE_SYMBOL "volk_EXPORTS")
+
+install(TARGETS volk
+ LIBRARY DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_runtime" # .so file
+ ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_devel" # .lib file
+ RUNTIME DESTINATION bin COMPONENT "volk_runtime" # .dll file
+)
+
+########################################################################
+# Build the QA test application
+########################################################################
+
+
+if(Boost_FOUND)
+
+ set_source_files_properties(
+ ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc PROPERTIES
+ COMPILE_DEFINITIONS "BOOST_TEST_DYN_LINK;BOOST_TEST_MAIN"
+ )
+
+ include_directories(${Boost_INCLUDE_DIRS})
+ link_directories(${Boost_LIBRARY_DIRS})
+
+ add_executable(test_all
+ ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/qa_utils.cc
+ )
+ target_link_libraries(test_all volk ${Boost_LIBRARIES})
+ add_test(qa_volk_test_all test_all)
+
+endif(Boost_FOUND)
+
diff --git a/volk/lib/gcc_x86_cpuid.h b/volk/lib/gcc_x86_cpuid.h
new file mode 100644
index 000000000..3c3f47b00
--- /dev/null
+++ b/volk/lib/gcc_x86_cpuid.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+/* %ecx */
+#define bit_SSE3 (1 << 0)
+#define bit_PCLMUL (1 << 1)
+#define bit_SSSE3 (1 << 9)
+#define bit_FMA (1 << 12)
+#define bit_CMPXCHG16B (1 << 13)
+#define bit_SSE4_1 (1 << 19)
+#define bit_SSE4_2 (1 << 20)
+#define bit_MOVBE (1 << 22)
+#define bit_POPCNT (1 << 23)
+#define bit_AES (1 << 25)
+#define bit_XSAVE (1 << 26)
+#define bit_OSXSAVE (1 << 27)
+#define bit_AVX (1 << 28)
+#define bit_F16C (1 << 29)
+#define bit_RDRND (1 << 30)
+
+/* %edx */
+#define bit_CMPXCHG8B (1 << 8)
+#define bit_CMOV (1 << 15)
+#define bit_MMX (1 << 23)
+#define bit_FXSAVE (1 << 24)
+#define bit_SSE (1 << 25)
+#define bit_SSE2 (1 << 26)
+
+/* Extended Features */
+/* %ecx */
+#define bit_LAHF_LM (1 << 0)
+#define bit_ABM (1 << 5)
+#define bit_SSE4a (1 << 6)
+#define bit_XOP (1 << 11)
+#define bit_LWP (1 << 15)
+#define bit_FMA4 (1 << 16)
+#define bit_TBM (1 << 21)
+
+/* %edx */
+#define bit_MMXEXT (1 << 22)
+#define bit_LM (1 << 29)
+#define bit_3DNOWP (1 << 30)
+#define bit_3DNOW (1 << 31)
+
+/* Extended Features (%eax == 7) */
+#define bit_FSGSBASE (1 << 0)
+#define bit_BMI (1 << 3)
+
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx may be the PIC register. */
+#if __GNUC__ >= 3
+#define __cpuid(level, a, b, c, d) \
+ __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchg{l}\t{%%}ebx, %1\n\t" \
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d) \
+ __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchg{l}\t{%%}ebx, %1\n\t" \
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "0" (level), "2" (count))
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+ nor alternatives in i386 code. */
+#define __cpuid(level, a, b, c, d) \
+ __asm__ ("xchgl\t%%ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchgl\t%%ebx, %1\n\t" \
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d) \
+ __asm__ ("xchgl\t%%ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchgl\t%%ebx, %1\n\t" \
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "0" (level), "2" (count))
+#endif
+#else
+#define __cpuid(level, a, b, c, d) \
+ __asm__ ("cpuid\n\t" \
+ : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
+ : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d) \
+ __asm__ ("cpuid\n\t" \
+ : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
+ : "0" (level), "2" (count))
+#endif
+
+/* Return highest supported input value for cpuid instruction. ext can
+ be either 0x0 or 0x80000000 to return highest supported value for
+ basic or extended cpuid information. Function returns 0 if cpuid
+ is not supported or whatever cpuid returns in eax register. If sig
+ pointer is non-null, then first four bytes of the signature
+ (as found in ebx register) are returned in location pointed by sig. */
+
+static __inline unsigned int
+__get_cpuid_max (unsigned int __ext, unsigned int *__sig)
+{
+ unsigned int __eax, __ebx, __ecx, __edx;
+
+#ifndef __x86_64__
+ /* See if we can use cpuid by trying to toggle bit 0x00200000 of the
+ flags register: the flags are saved, the bit is flipped and written
+ back, and the flags are read again. On AMD64 we always can. */
+#if __GNUC__ >= 3
+ __asm__ ("pushf{l|d}\n\t"
+ "pushf{l|d}\n\t"
+ "pop{l}\t%0\n\t"
+ "mov{l}\t{%0, %1|%1, %0}\n\t"
+ "xor{l}\t{%2, %0|%0, %2}\n\t"
+ "push{l}\t%0\n\t"
+ "popf{l|d}\n\t"
+ "pushf{l|d}\n\t"
+ "pop{l}\t%0\n\t"
+ "popf{l|d}\n\t"
+ : "=&r" (__eax), "=&r" (__ebx)
+ : "i" (0x00200000));
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+ nor alternatives in i386 code. */
+ __asm__ ("pushfl\n\t"
+ "pushfl\n\t"
+ "popl\t%0\n\t"
+ "movl\t%0, %1\n\t"
+ "xorl\t%2, %0\n\t"
+ "pushl\t%0\n\t"
+ "popfl\n\t"
+ "pushfl\n\t"
+ "popl\t%0\n\t"
+ "popfl\n\t"
+ : "=&r" (__eax), "=&r" (__ebx)
+ : "i" (0x00200000));
+#endif
+
+ if (!((__eax ^ __ebx) & 0x00200000)) /* the bit read back unchanged: cpuid is not available */
+ return 0;
+#endif
+
+ /* Host supports cpuid. Return highest supported cpuid input value. */
+ __cpuid (__ext, __eax, __ebx, __ecx, __edx);
+
+ if (__sig)
+ *__sig = __ebx;
+
+ return __eax;
+}
+
+/* Return cpuid data for requested cpuid level, as found in returned
+   eax, ebx, ecx and edx registers. The function checks if cpuid is
+   supported and returns 1 for valid cpuid information or 0 for
+   unsupported cpuid level (including the case where the cpuid
+   instruction itself is not available). All pointers are required
+   to be non-null. */
+
+static __inline int
+__get_cpuid (unsigned int __level,
+             unsigned int *__eax, unsigned int *__ebx,
+             unsigned int *__ecx, unsigned int *__edx)
+{
+  /* Bit 31 of the level selects the extended (0x80000000) range. */
+  unsigned int __ext = __level & 0x80000000;
+  unsigned int __maxlevel = __get_cpuid_max (__ext, 0);
+
+  /* __get_cpuid_max returns 0 when cpuid is not supported. Testing only
+     "__maxlevel < __level" lets __level == 0 through in that case
+     (0 < 0 is false) and would execute cpuid anyway, so reject a zero
+     maximum explicitly. */
+  if (__maxlevel == 0 || __maxlevel < __level)
+    return 0;
+
+  __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx);
+  return 1;
+}
diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc
new file mode 100644
index 000000000..8da43b972
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.cc
@@ -0,0 +1,89 @@
+#include <volk/volk.h>
+#include <qa_16s_add_quad_aligned16.h>
+#include <volk/volk_16s_add_quad_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+// Build without LV_HAVE_SSE2: nothing to compare, only report that the test
+// was skipped.
+void qa_16s_add_quad_aligned16::t1() {
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+
+
+// Runs the 16s_add_quad kernel through its "generic" and "sse2"
+// implementations on the same five random input vectors, prints the elapsed
+// time of each, and requires the four output vectors to match exactly.
+void qa_16s_add_quad_aligned16::t1() {
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3200;
+ const int ITERS = 100000;
+ __VOLK_ATTR_ALIGNED(16) short input0[vlen];
+ __VOLK_ATTR_ALIGNED(16) short input1[vlen];
+ __VOLK_ATTR_ALIGNED(16) short input2[vlen];
+ __VOLK_ATTR_ALIGNED(16) short input3[vlen];
+ __VOLK_ATTR_ALIGNED(16) short input4[vlen];
+
+ // output0..output3 receive the "generic" results, output01..output31 the
+ // "sse2" results.
+ __VOLK_ATTR_ALIGNED(16) short output0[vlen];
+ __VOLK_ATTR_ALIGNED(16) short output1[vlen];
+ __VOLK_ATTR_ALIGNED(16) short output2[vlen];
+ __VOLK_ATTR_ALIGNED(16) short output3[vlen];
+ __VOLK_ATTR_ALIGNED(16) short output01[vlen];
+ __VOLK_ATTR_ALIGNED(16) short output11[vlen];
+ __VOLK_ATTR_ALIGNED(16) short output21[vlen];
+ __VOLK_ATTR_ALIGNED(16) short output31[vlen];
+
+ // Each input sample is the difference of two random shorts that were
+ // shifted right by two bits first (presumably to leave headroom for the
+ // kernel's additions -- TODO confirm).
+ for(int i = 0; i < vlen; ++i) {
+ short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short plus1 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short minus1 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short plus2 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short minus2 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short plus3 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+ short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+
+ input0[i] = plus0 - minus0;
+ input1[i] = plus1 - minus1;
+ input2[i] = plus2 - minus2;
+ input3[i] = plus3 - minus3;
+ input4[i] = plus4 - minus4;
+
+ }
+ printf("16s_add_quad_aligned\n");
+
+ // Time ITERS calls of the generic implementation; the length argument is
+ // vlen << 1, i.e. twice the element count.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_add_quad_aligned16_manual(output0, output1, output2, output3, input0, input1, input2, input3, input4, vlen << 1 , "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ // Same measurement for the sse2 implementation, writing to the second set
+ // of output buffers.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_16s_add_quad_aligned16_manual(output01, output11, output21, output31, input0, input1, input2, input3, input4, vlen << 1 , "sse2");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse2_time: %f\n", total);
+ // Debug output, currently disabled; the loop body is empty.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ // Both implementations must agree on every element of all four outputs.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+ CPPUNIT_ASSERT_EQUAL(output1[i], output11[i]);
+ CPPUNIT_ASSERT_EQUAL(output2[i], output21[i]);
+ CPPUNIT_ASSERT_EQUAL(output3[i], output31[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_add_quad_aligned16.h b/volk/lib/qa_16s_add_quad_aligned16.h
new file mode 100644
index 000000000..3c1ae978b
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
+#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case whose suite registers the single test method t1().
+class qa_16s_add_quad_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; the definition lives in qa_16s_add_quad_aligned16.cc.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
new file mode 100644
index 000000000..5a58569a1
--- /dev/null
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
@@ -0,0 +1,106 @@
+#include <volk/volk.h>
+#include <qa_16s_branch_4_state_8_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+// Build without LV_HAVE_SSSE3: nothing to compare, only report the skip.
+void qa_16s_branch_4_state_8_aligned16::t1() {
+  printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+// Times three kernels on the same 32-element input -- the "sse2"
+// permute_and_scalar_add kernel and the "ssse3" and "generic"
+// branch_4_state_8 kernels -- then requires all three result vectors to be
+// identical element by element.
+void qa_16s_branch_4_state_8_aligned16::t1() {
+  const int num_iters = 1000000;
+  const int vlen = 32;
+
+  // Four 16-byte tables handed to the branch_4_state_8 kernel via permuters.
+ static char permute0[16]__attribute__((aligned(16))) = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03};
+ static char permute1[16]__attribute__((aligned(16))) = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01};
+ static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f};
+ static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d};
+  static char* permuters[4] = {permute0, permute1, permute2, permute3};
+
+  // Length argument for permute_and_scalar_add: twice the element count.
+  unsigned int num_bytes = vlen << 1;
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+
+  // target: permute_and_scalar_add (sse2); target2: branch_4_state_8 (ssse3);
+  // target3: branch_4_state_8 (generic).
+  __VOLK_ATTR_ALIGNED(16) short target[vlen];
+  __VOLK_ATTR_ALIGNED(16) short target2[vlen];
+  __VOLK_ATTR_ALIGNED(16) short target3[vlen];
+
+  __VOLK_ATTR_ALIGNED(16) short src0[vlen];
+  // permute_indexes, cntl0 and cntl1 are only consumed by the
+  // permute_and_scalar_add call; cntl2, cntl3 and scalars feed both kernels.
+  __VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen] = {
+7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 };
+  __VOLK_ATTR_ALIGNED(16) short cntl0[vlen] = {
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
+  __VOLK_ATTR_ALIGNED(16) short cntl1[vlen] = {
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
+  __VOLK_ATTR_ALIGNED(16) short cntl2[vlen] = {
+ 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 };
+  __VOLK_ATTR_ALIGNED(16) short cntl3[vlen] = {
+ 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff };
+  __VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4};
+
+  // The source vector is simply 0, 1, ..., vlen-1.
+  for(int i = 0; i < vlen; ++i) {
+    src0[i] = i;
+  }
+
+  printf("16s_branch_4_state_8_aligned\n");
+
+  // Reference run: permute_and_scalar_add, sse2 implementation.
+  start = clock();
+  for(int i = 0; i < num_iters; ++i) {
+    volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("permute_and_scalar_add_time: %f\n", total);
+
+  // branch_4_state_8, ssse3 implementation.
+  start = clock();
+  for(int i = 0; i < num_iters; ++i) {
+    volk_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("branch_4_state_8_time, ssse3: %f\n", total);
+
+  // branch_4_state_8, generic implementation.
+  start = clock();
+  for(int i = 0; i < num_iters; ++i) {
+    volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  // The label names the kernel that was actually timed above.
+  printf("branch_4_state_8_time, generic: %f\n", total);
+
+  for(int i = 0; i < vlen; ++i) {
+    printf("psa... %d, b4s8... %d\n", target[i], target3[i]);
+  }
+
+  // All three result vectors must agree.
+  for(int i = 0; i < vlen; ++i) {
+    CPPUNIT_ASSERT(target[i] == target2[i]);
+    CPPUNIT_ASSERT(target[i] == target3[i]);
+  }
+}
+
+
+#endif
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.h b/volk/lib/qa_16s_branch_4_state_8_aligned16.h
new file mode 100644
index 000000000..41ab073e0
--- /dev/null
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
+#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case whose suite registers the single test method t1().
+class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; the definition lives in qa_16s_branch_4_state_8_aligned16.cc.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
new file mode 100644
index 000000000..dadd2c580
--- /dev/null
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
@@ -0,0 +1,78 @@
+#include <volk/volk.h>
+#include <qa_16s_permute_and_scalar_add_aligned16.h>
+#include <volk/volk_16s_permute_and_scalar_add_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+// Build without LV_HAVE_SSE2: nothing to compare, only report the skip.
+void qa_16s_permute_and_scalar_add_aligned16::t1() {
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+// Runs the permute_and_scalar_add kernel through its "generic" and "sse2"
+// implementations on identical inputs, prints the time for 100000 calls of
+// each, and requires the two result vectors to match.
+void qa_16s_permute_and_scalar_add_aligned16::t1() {
+ const int vlen = 64;
+
+ // Length argument passed to the kernel: twice the element count.
+ unsigned int num_bytes = vlen << 1;
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+
+ // target holds the generic result, target2 the sse2 result.
+ __VOLK_ATTR_ALIGNED(16) short target[vlen];
+ __VOLK_ATTR_ALIGNED(16) short target2[vlen];
+ __VOLK_ATTR_ALIGNED(16) short src0[vlen];
+ __VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen];
+ __VOLK_ATTR_ALIGNED(16) short cntl0[vlen];
+ __VOLK_ATTR_ALIGNED(16) short cntl1[vlen];
+ __VOLK_ATTR_ALIGNED(16) short cntl2[vlen];
+ __VOLK_ATTR_ALIGNED(16) short cntl3[vlen];
+ __VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4};
+
+ // src0 is 0..vlen-1; the permutation index is 3*i modulo vlen; each control
+ // vector is either 0xff or 0 following a different pattern in i.
+ // NOTE(review): the masks are 0xff here while the branch_4_state_8 test
+ // uses 0xffff for the same kernel -- confirm which the kernel expects.
+ for(int i = 0; i < vlen; ++i) {
+ src0[i] = i;
+ permute_indexes[i] = (3 * i)%vlen;
+ cntl0[i] = 0xff;
+ cntl1[i] = 0xff * (i%2);
+ cntl2[i] = 0xff * ((i>>1)%2);
+ cntl3[i] = 0xff * ((i%4) == 3);
+ }
+
+ printf("16s_permute_and_scalar_add_aligned\n");
+
+ // Time the generic implementation.
+ start = clock();
+ for(int i = 0; i < 100000; ++i) {
+ volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic");
+ }
+ end = clock();
+
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+ printf("generic_time: %f\n", total);
+
+ // Time the sse2 implementation.
+ start = clock();
+ for(int i = 0; i < 100000; ++i) {
+ volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
+ }
+ end = clock();
+
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+ printf("sse2_time: %f\n", total);
+
+
+ // Debug output, currently disabled; the loop body is empty.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("generic... %d, sse2... %d\n", target[i], target2[i]);
+ }
+
+ // Both implementations must produce the same vector.
+ for(int i = 0; i < vlen; ++i) {
+
+ CPPUNIT_ASSERT(target[i] == target2[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h
new file mode 100644
index 000000000..3643aeef6
--- /dev/null
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case whose suite registers the single test method t1().
+class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; the definition lives in
+ // qa_16s_permute_and_scalar_add_aligned16.cc.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc
new file mode 100644
index 000000000..2a5dec44a
--- /dev/null
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16s_quad_max_star_aligned16.h>
+#include <volk/volk_16s_quad_max_star_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+// Build without LV_HAVE_SSE2: nothing to compare, only report the skip.
+void qa_16s_quad_max_star_aligned16::t1() {
+ printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+// Feeds four random short vectors to the quad_max_star kernel once through
+// "generic" and once through "sse2", prints every element, and requires the
+// two outputs to be equal.  No timing is done in this test.
+void qa_16s_quad_max_star_aligned16::t1() {
+ const int vlen = 34;
+
+ __VOLK_ATTR_ALIGNED(16) short input0[vlen];
+ __VOLK_ATTR_ALIGNED(16) short input1[vlen];
+ __VOLK_ATTR_ALIGNED(16) short input2[vlen];
+ __VOLK_ATTR_ALIGNED(16) short input3[vlen];
+
+ // output0 receives the generic result, output1 the sse2 result.
+ __VOLK_ATTR_ALIGNED(16) short output0[vlen];
+ __VOLK_ATTR_ALIGNED(16) short output1[vlen];
+
+ // Each input sample is the difference of two random values that were cast
+ // to short first.
+ for(int i = 0; i < vlen; ++i) {
+ short plus0 = (short) (rand() - (RAND_MAX/2));
+ short plus1 = (short) (rand() - (RAND_MAX/2));
+ short plus2 = (short) (rand() - (RAND_MAX/2));
+ short plus3 = (short) (rand() - (RAND_MAX/2));
+
+ short minus0 = (short) (rand() - (RAND_MAX/2));
+ short minus1 = (short) (rand() - (RAND_MAX/2));
+ short minus2 = (short) (rand() - (RAND_MAX/2));
+ short minus3 = (short) (rand() - (RAND_MAX/2));
+
+ input0[i] = plus0 - minus0;
+ input1[i] = plus1 - minus1;
+ input2[i] = plus2 - minus2;
+ input3[i] = plus3 - minus3;
+ }
+
+ // The length argument is 2*vlen, i.e. twice the element count.
+ volk_16s_quad_max_star_aligned16_manual(output0, input0, input1, input2, input3, 2*vlen, "generic");
+
+ volk_16s_quad_max_star_aligned16_manual(output1, input0, input1, input2, input3, 2*vlen, "sse2");
+
+ printf("16s_quad_max_star_aligned\n");
+ for(int i = 0; i < vlen; ++i) {
+ printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]);
+ }
+
+ // Both implementations must agree on every element.
+ for(int i = 0; i < vlen; ++i) {
+
+ CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.h b/volk/lib/qa_16s_quad_max_star_aligned16.h
new file mode 100644
index 000000000..51e77081a
--- /dev/null
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
+#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case whose suite registers the single test method t1().
+class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; the definition lives in qa_16s_quad_max_star_aligned16.cc.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc
new file mode 100644
index 000000000..4e792ec6c
--- /dev/null
+++ b/volk/lib/qa_32f_fm_detect_aligned16.cc
@@ -0,0 +1,61 @@
+#include <volk/volk.h>
+#include <qa_32f_fm_detect_aligned16.h>
+#include <volk/volk_32f_fm_detect_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+// Build without LV_HAVE_SSE: nothing to compare, only report the skip.
+void qa_32f_fm_detect_aligned16::t1() {
+ printf("sse not available... no test performed\n");
+}
+
+#else
+
+// Runs the fm_detect kernel ITERS times through "generic" and then through
+// "sse" on the same random input, prints both timings, and compares the final
+// outputs with a relative tolerance of 1e-4.
+void qa_32f_fm_detect_aligned16::t1() {
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 10000;
+ __VOLK_ATTR_ALIGNED(16) float input0[vlen];
+
+ // output0 receives the generic result, output01 the sse result.
+ __VOLK_ATTR_ALIGNED(16) float output0[vlen];
+ __VOLK_ATTR_ALIGNED(16) float output01[vlen];
+
+ // Random input scaled by RAND_MAX/2 so it lies roughly within [-1, 1].
+ for(int i = 0; i < vlen; ++i) {
+ input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+ }
+ printf("32f_fm_detect_aligned\n");
+
+ start = clock();
+ // save is passed by address on every call, so the kernel may update it
+ // between iterations; it starts at 0.1 for each implementation.
+ float save = 0.1;
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_fm_detect_aligned16_manual(output0, input0, 1.0, &save, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ // Reset the carried state so the sse run starts from the same value.
+ save = 0.1;
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32f_fm_detect_aligned16_manual(output01, input0, 1.0, &save, vlen, "sse");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse_time: %f\n", total);
+ // Debug output, currently disabled; the loop body is empty.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ // The allowed difference scales with the generic value, so an element that
+ // is exactly zero must be reproduced exactly.
+ // NOTE(review): fabs is used although this file includes no math header
+ // itself -- presumably it arrives through another include; confirm.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4);
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.h b/volk/lib/qa_32f_fm_detect_aligned16.h
new file mode 100644
index 000000000..a2680c524
--- /dev/null
+++ b/volk/lib/qa_32f_fm_detect_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
+#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case whose suite registers the single test method t1().
+class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; the definition lives in qa_32f_fm_detect_aligned16.cc.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc
new file mode 100644
index 000000000..2df206726
--- /dev/null
+++ b/volk/lib/qa_32f_index_max_aligned16.cc
@@ -0,0 +1,103 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_index_max_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 1000000
+#define VEC_LEN 3097
+// Draw one value from rand() and map it linearly onto the range -1 .. 1.
+static float uniform() {
+  return ((float) rand() / RAND_MAX - 0.5) * 2.0;
+}
+
+// Fill buf[0] .. buf[n-1] with uniform() samples scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  float *const stop = buf + n;
+  for (float *cursor = buf; cursor != stop; ++cursor)
+    *cursor = uniform () * 32767;
+}
+
+
+#ifndef LV_HAVE_SSE
+
+// Build without LV_HAVE_SSE: nothing to compare, only report the skip.
+void qa_32f_index_max_aligned16::t1(){
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+// Runs the index_max kernel on one random float vector three ways -- the
+// "generic" implementation, the implementation named "sse2", and whatever the
+// runtime dispatcher selects -- prints the time for NUM_ITERS calls of each,
+// and requires all three to report the same index.
+void qa_32f_index_max_aligned16::t1(){
+
+  const int vlen = VEC_LEN;
+
+  volk_runtime_init();
+
+  volk_environment_init();
+
+  // Each kernel writes its answer through a pointer to one of these.
+  unsigned int i_target_generic = 0;
+  unsigned int i_target_sse = 0;
+  unsigned int i_target_sse4_1 = 0;
+
+  float* src0 = NULL;
+  const int ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float));
+  // posix_memalign reports failure through its return value and leaves the
+  // pointer unusable, so stop here rather than write through it.
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+
+  random_floats(src0, vlen);
+
+  printf("32f_index_max_aligned16\n");
+
+  clock_t start, end;
+  double total;
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32f_index_max_aligned16_manual(&i_target_generic, src0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  // NOTE(review): the implementation requested is "sse2" while the guard and
+  // the printed label say sse -- confirm the kernel really provides "sse2".
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32f_index_max_aligned16_manual(&i_target_sse, src0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse time: %f\n", total);
+
+  // Runtime-dispatched call; the output labels it sse4.1.
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    get_volk_runtime()->volk_32f_index_max_aligned16(&i_target_sse4_1, src0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.1 time: %f\n", total);
+
+  // The results live on the stack, so release the buffer before asserting:
+  // a failing CppUnit assertion throws and would otherwise skip the free.
+  free(src0);
+
+  printf("generic: %u, sse: %u, sse4.1: %u\n", i_target_generic, i_target_sse, i_target_sse4_1);
+  CPPUNIT_ASSERT_EQUAL(i_target_generic, i_target_sse);
+  CPPUNIT_ASSERT_EQUAL(i_target_generic, i_target_sse4_1);
+}
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32f_index_max_aligned16.h b/volk/lib/qa_32f_index_max_aligned16.h
new file mode 100644
index 000000000..8cadffa47
--- /dev/null
+++ b/volk/lib/qa_32f_index_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case whose suite registers the single test method t1().
+class qa_32f_index_max_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; the definition lives in qa_32f_index_max_aligned16.cc.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc
new file mode 100644
index 000000000..3859bcb52
--- /dev/null
+++ b/volk/lib/qa_32fc_index_max_aligned16.cc
@@ -0,0 +1,89 @@
+#include <volk/volk.h>
+#include <qa_32fc_index_max_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 1000000
+#define VEC_LEN 3096
+// Draw one value from rand() and map it linearly onto the range -1 .. 1.
+static float uniform() {
+  return ((float) rand() / RAND_MAX - 0.5) * 2.0;
+}
+
+// Fill buf[0] .. buf[n-1] with uniform() samples scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  float *const stop = buf + n;
+  for (float *cursor = buf; cursor != stop; ++cursor)
+    *cursor = uniform () * 32767;
+}
+
+
+#ifndef LV_HAVE_SSE3
+
+// Build without LV_HAVE_SSE3: nothing to compare, only report the skip.
+void qa_32fc_index_max_aligned16::t1(){
+  printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+// Runs the complex index_max kernel on one random vector through "generic"
+// and "sse3", prints the time for NUM_ITERS calls of each, and requires the
+// two reported indices to differ by no more than 1.1.
+void qa_32fc_index_max_aligned16::t1(){
+
+  const int vlen = VEC_LEN;
+
+  volk_environment_init();
+
+  // Each kernel writes its answer through a pointer to one of these.
+  unsigned int i_target = 0;
+  unsigned int i_target_generic = 0;
+
+  // vlen << 3 is the buffer size in bytes: eight per complex<float> sample.
+  std::complex<float>* src0 = NULL;
+  const int ret = posix_memalign((void**)&src0, 16, vlen << 3);
+  // posix_memalign reports failure through its return value and leaves the
+  // pointer unusable, so stop here rather than write through it.
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+
+  // Fill the buffer as 2*vlen plain floats.
+  random_floats((float*)src0, vlen * 2);
+
+  printf("32fc_index_max_aligned16\n");
+
+  clock_t start, end;
+  double total;
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_index_max_aligned16_manual(&i_target_generic, src0, vlen << 3, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_index_max_aligned16_manual(&i_target, src0, vlen << 3, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3 time: %f\n", total);
+
+  // The results live on the stack, so release the buffer before asserting:
+  // a failing CppUnit assertion throws and would otherwise skip the free.
+  free(src0);
+
+  printf("generic: %u, sse3: %u\n", i_target_generic, i_target);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(i_target_generic, i_target, 1.1);
+}
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32fc_index_max_aligned16.h b/volk/lib/qa_32fc_index_max_aligned16.h
new file mode 100644
index 000000000..0990bcb1f
--- /dev/null
+++ b/volk/lib/qa_32fc_index_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case whose suite registers the single test method t1().
+class qa_32fc_index_max_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; the definition lives in qa_32fc_index_max_aligned16.cc.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
new file mode 100644
index 000000000..daca31d9c
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
@@ -0,0 +1,64 @@
+#include <volk/volk.h>
+#include <qa_32fc_power_spectral_density_32f_aligned16.h>
+#include <volk/volk_32fc_power_spectral_density_32f_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse3
+
+#ifndef LV_HAVE_SSE3
+
+// Build without LV_HAVE_SSE3: nothing to compare, only report the skip.
+void qa_32fc_power_spectral_density_32f_aligned16::t1() {
+ printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+// Runs the power_spectral_density kernel ITERS times through "generic" and
+// "sse3" on the same random complex input, prints both timings, and compares
+// the outputs with a relative tolerance of 1e-4.
+void qa_32fc_power_spectral_density_32f_aligned16::t1() {
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+ const int vlen = 3201;
+ const int ITERS = 10000;
+ __VOLK_ATTR_ALIGNED(16) std::complex<float> input0[vlen];
+
+ __VOLK_ATTR_ALIGNED(16) float output_generic[vlen];
+ __VOLK_ATTR_ALIGNED(16) float output_sse3[vlen];
+
+ // The two scalar arguments handed to the kernel after the input pointer.
+ const float scalar = vlen;
+ const float rbw = 1.7;
+
+ // Fill the complex buffer as 2*vlen plain floats, scaled by RAND_MAX/2.
+ float* inputLoad = (float*)input0;
+ for(int i = 0; i < 2*vlen; ++i) {
+ inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+ }
+ printf("32fc_power_spectral_density_32f_aligned\n");
+
+ // Time the generic implementation.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ // Time the sse3 implementation.
+ start = clock();
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3");
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse3_time: %f\n", total);
+
+ // Debug output, currently disabled; the loop body is empty.
+ for(int i = 0; i < 1; ++i) {
+ //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+ //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+ }
+
+ // The allowed difference scales with the magnitude of the generic value.
+ for(int i = 0; i < vlen; ++i) {
+ //printf("%d...%d\n", output0[i], output01[i]);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4));
+ }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h
new file mode 100644
index 000000000..26f430bec
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case whose suite registers the single test method t1().
+class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; the definition lives in
+ // qa_32fc_power_spectral_density_32f_aligned16.cc.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc
new file mode 100644
index 000000000..b825c20e4
--- /dev/null
+++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc
@@ -0,0 +1,138 @@
+#include <volk/volk.h>
+#include <qa_32fc_x2_conjugate_dot_prod_32fc_u.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+
+#define assertcomplexEqual(expected, actual, delta) \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);
+
+#define ERR_DELTA (1e-4)
+
+//test for sse
+
+#if LV_HAVE_SSE && (LV_HAVE_64 || LV_HAVE_32)
+
+// Name of the SSE implementation that is compared against "generic":
+// "sse" when LV_HAVE_64 is set, otherwise "sse_32".  Kept as a string literal
+// so the call site passes exactly what it passed before.
+#if LV_HAVE_64
+#define QA_CONJ_DOT_PROD_SSE_ARCH "sse"
+#else
+#define QA_CONJ_DOT_PROD_SSE_ARCH "sse_32"
+#endif
+
+// Draw one value from rand() and map it linearly onto the range -1 .. 1.
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);
+}
+
+// Fill buf[0] .. buf[n-1] with uniform() samples scaled by 32767.
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned i = 0; i < n; i++)
+    buf[i] = uniform () * 32767;
+}
+
+// Computes the conjugate dot product of two random complex vectors once with
+// the "generic" implementation and once with the SSE implementation selected
+// above, prints both results, and requires them to agree within ERR_DELTA
+// (relative, per component).
+void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
+  const int vlen = 789743;
+
+  volk_environment_init();
+  int ret;
+
+  std::complex<float>* input = NULL;
+  std::complex<float>* taps = NULL;
+
+  std::complex<float>* result_generic = NULL;
+  std::complex<float>* result = NULL;
+
+  // posix_memalign reports failure through its return value and leaves the
+  // pointer unusable, so every allocation is checked before it is used.
+  // vlen << 3 is the size in bytes: eight per complex<float> sample.
+  ret = posix_memalign((void**)&input, 16, vlen << 3);
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+  ret = posix_memalign((void**)&taps, 16, vlen << 3);
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+  ret = posix_memalign((void**)&result_generic, 16, 8);
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+  ret = posix_memalign((void**)&result, 16, 8);
+  CPPUNIT_ASSERT_EQUAL(0, ret);
+
+  result_generic[0] = std::complex<float>(0,0);
+  result[0] = std::complex<float>(0,0);
+
+  // Fill both vectors as 2*vlen plain floats.
+  random_floats((float*)input, vlen * 2);
+  random_floats((float*)taps, vlen * 2);
+
+  // The length argument is vlen * 8, i.e. the vector size in bytes.
+  volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic");
+
+  volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, QA_CONJ_DOT_PROD_SSE_ARCH);
+
+  // Copy the two results out and release every buffer before asserting:
+  // a failing CppUnit assertion throws and would otherwise skip the frees.
+  const std::complex<float> expected = result_generic[0];
+  const std::complex<float> actual = result[0];
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result);
+
+  printf("32fc_x2_conjugate_dot_prod_32fc_u\n");
+  printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(expected), std::imag(expected), std::real(actual), std::imag(actual));
+
+  assertcomplexEqual(expected, actual, ERR_DELTA);
+
+}
+
+
+#else
+
+// Neither SSE variant is available: only report that the test was skipped.
+void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h
new file mode 100644
index 000000000..f07402403
--- /dev/null
+++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H
+#define INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case whose suite registers the single test method t1().
+class qa_32fc_x2_conjugate_dot_prod_32fc_u : public CppUnit::TestCase {
+
+ CPPUNIT_TEST_SUITE (qa_32fc_x2_conjugate_dot_prod_32fc_u);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; the definition lives in qa_32fc_x2_conjugate_dot_prod_32fc_u.cc.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H */
diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc
new file mode 100644
index 000000000..5559d933d
--- /dev/null
+++ b/volk/lib/qa_32u_popcnt_aligned16.cc
@@ -0,0 +1,62 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32u_popcnt_aligned16.h>
+#include <volk/volk_32u_popcnt_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse4.2
+
+#ifndef LV_HAVE_SSE4_2
+
+// Build without LV_HAVE_SSE4_2: nothing to compare, only report the skip.
+void qa_32u_popcnt_aligned16::t1() {
+ printf("sse4.2 not available... no test performed\n");
+}
+
+#else
+
+// Calls the 32-bit popcnt kernel ITERS times on one random word, first via
+// the "generic" implementation and then via the runtime dispatcher, sums the
+// returned counts for each, prints both timings, and requires the two sums
+// to be equal.
+void qa_32u_popcnt_aligned16::t1() {
+
+
+ volk_runtime_init();
+
+ volk_environment_init();
+ clock_t start, end;
+ double total;
+
+ const int ITERS = 10000000;
+ __VOLK_ATTR_ALIGNED(16) uint32_t input0;
+
+ // output0 accumulates the generic counts, output01 the dispatched counts.
+ __VOLK_ATTR_ALIGNED(16) uint32_t output0;
+ __VOLK_ATTR_ALIGNED(16) uint32_t output01;
+
+ input0 = ((uint32_t) (rand() - (RAND_MAX/2)));
+ output0 = 0;
+ output01 = 0;
+
+ printf("32u_popcnt_aligned\n");
+
+ start = clock();
+ uint32_t ret = 0;
+ for(int count = 0; count < ITERS; ++count) {
+ volk_32u_popcnt_aligned16_manual(&ret, input0, "generic");
+ output0 += ret;
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("generic_time: %f\n", total);
+ start = clock();
+ ret = 0;
+ // Runtime-dispatched call; the output labels it sse4.2.
+ for(int count = 0; count < ITERS; ++count) {
+ get_volk_runtime()->volk_32u_popcnt_aligned16(&ret, input0);
+ output01 += ret;
+ }
+ end = clock();
+ total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+ printf("sse4.2_time: %f\n", total);
+
+
+ CPPUNIT_ASSERT_EQUAL(output0, output01);
+}
+
+#endif
diff --git a/volk/lib/qa_32u_popcnt_aligned16.h b/volk/lib/qa_32u_popcnt_aligned16.h
new file mode 100644
index 000000000..fa1dc1041
--- /dev/null
+++ b/volk/lib/qa_32u_popcnt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32U_POPCNT_ALIGNED16_H
+#define INCLUDED_QA_32U_POPCNT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case whose suite registers a single test method, t1.
+class qa_32u_popcnt_aligned16 : public CppUnit::TestCase {
+
+ // Suite declaration: t1 is the only test listed.
+ CPPUNIT_TEST_SUITE (qa_32u_popcnt_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; only declared in this class, defined out of line.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32U_POPCNT_ALIGNED16_H */
diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc
new file mode 100644
index 000000000..391601f22
--- /dev/null
+++ b/volk/lib/qa_64u_popcnt_aligned16.cc
@@ -0,0 +1,62 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_64u_popcnt_aligned16.h>
+#include <volk/volk_64u_popcnt_aligned16.h>
+#include <cstdlib>
+#include <ctime>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE4_2
+
+// Variant compiled when LV_HAVE_SSE4_2 is not defined: only reports that the
+// test was skipped.
+void qa_64u_popcnt_aligned16::t1()
+{
+    static const char skip_notice[] = "sse4.2 not available... no test performed\n";
+    printf("%s", skip_notice);
+}
+
+#else
+
+// Runs the "generic" implementation and the one exposed by get_volk_runtime()
+// ITERS times each on the same random word, prints the elapsed time of both
+// loops, and asserts that the two accumulated results are equal.
+void qa_64u_popcnt_aligned16::t1()
+{
+    volk_runtime_init();
+    volk_environment_init();
+
+    const int ITERS = 10000000;
+
+    //one random input word, shared by both loops
+    __VOLK_ATTR_ALIGNED(16) uint64_t input0 = ((uint64_t) (rand() - (RAND_MAX/2)));
+    //running sums of the per-call results
+    __VOLK_ATTR_ALIGNED(16) uint64_t output0 = 0;
+    __VOLK_ATTR_ALIGNED(16) uint64_t output01 = 0;
+
+    printf("64u_popcnt_aligned\n");
+
+    //loop 1: implementation selected by name
+    clock_t tic = clock();
+    uint64_t ret = 0;
+    for(int pass = ITERS; pass > 0; --pass) {
+        volk_64u_popcnt_aligned16_manual(&ret, input0, "generic");
+        output0 += ret;
+    }
+    clock_t toc = clock();
+    printf("generic_time: %f\n", (double)(toc-tic)/(double)CLOCKS_PER_SEC);
+
+    //loop 2: implementation reached through the runtime table
+    tic = clock();
+    ret = 0;
+    for(int pass = ITERS; pass > 0; --pass) {
+        get_volk_runtime()->volk_64u_popcnt_aligned16(&ret, input0);
+        output01 += ret;
+    }
+    toc = clock();
+    printf("sse4.2_time: %f\n", (double)(toc-tic)/(double)CLOCKS_PER_SEC);
+
+    CPPUNIT_ASSERT_EQUAL(output0, output01);
+}
+
+#endif
diff --git a/volk/lib/qa_64u_popcnt_aligned16.h b/volk/lib/qa_64u_popcnt_aligned16.h
new file mode 100644
index 000000000..217822d6e
--- /dev/null
+++ b/volk/lib/qa_64u_popcnt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64U_POPCNT_ALIGNED16_H
+#define INCLUDED_QA_64U_POPCNT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+// CppUnit test case whose suite registers a single test method, t1.
+class qa_64u_popcnt_aligned16 : public CppUnit::TestCase {
+
+ // Suite declaration: t1 is the only test listed.
+ CPPUNIT_TEST_SUITE (qa_64u_popcnt_aligned16);
+ CPPUNIT_TEST (t1);
+ CPPUNIT_TEST_SUITE_END ();
+
+ private:
+ // Test body; only declared in this class, defined out of line.
+ void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_64U_POPCNT_ALIGNED16_H */
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
new file mode 100644
index 000000000..e526eb2d0
--- /dev/null
+++ b/volk/lib/qa_utils.cc
@@ -0,0 +1,477 @@
+#include "qa_utils.h"
+#include <cstring>
+#include <boost/foreach.hpp>
+#include <boost/assign/list_of.hpp>
+#include <boost/tokenizer.hpp>
+#include <iostream>
+#include <vector>
+#include <list>
+#include <ctime>
+#include <cmath>
+#include <limits>
+#include <boost/lexical_cast.hpp>
+#include <volk/volk.h>
+#include <volk/volk_cpu.h>
+#include <volk/volk_common.h>
+#include <boost/typeof/typeof.hpp>
+#include <boost/type_traits.hpp>
+#include <stdio.h>
+
+// Returns a pseudo-random float in [-1, 1] derived from one call to rand().
+float uniform() {
+    const float unit = static_cast<float>(rand()) / RAND_MAX; // in [0, 1]
+    return 2.0 * (unit - 0.5);
+}
+
+// Fills buf[0..n) with successive values from uniform(), converted to t.
+template <class t>
+void random_floats (t *buf, unsigned n)
+{
+    t *const stop = buf + n;
+    for (t *cursor = buf; cursor != stop; ++cursor)
+        *cursor = uniform ();
+}
+
+// Fills `data` with pseudo-random elements of the given type.
+//   data - destination buffer; must hold n elements (2*n when type.is_complex)
+//          of type.size bytes each
+//   type - element description; is_complex, is_float, is_signed and size are read
+//   n    - number of (possibly complex) elements to generate
+// Floating-point types are filled by random_floats(). Integer types receive
+// values spread over the full representable range of the element type:
+// [-2^(bits-1), 2^(bits-1)) when signed, [0, 2^bits) when unsigned.
+// Throws a C string when an integer type has a size other than 1, 2, 4 or 8.
+void load_random_data(void *data, volk_type_t type, unsigned int n) {
+    if(type.is_complex) n *= 2;
+    if(type.is_float) {
+        if(type.size == 8) random_floats<double>((double *)data, n);
+        else random_floats<float>((float *)data, n);
+    } else {
+        //2^(bits-1). The old bound, uint64_t(2) << bits, was undefined for 8-byte
+        //types and twice the real range for the rest, so the casts below were
+        //out of range for the destination type.
+        const double half_range = std::ldexp(1.0, type.size*8 - 1);
+        for(unsigned int i=0; i<n; i++) {
+            //unit is in [0, 1): dividing by RAND_MAX + 1 keeps the upper edge open,
+            //so the scaled value below never reaches the (unrepresentable) top of the range
+            const double unit = double(rand()) / (double(RAND_MAX) + 1.0);
+            const double scaled_rand = type.is_signed ? (2.0*unit - 1.0) * half_range
+                                                       : 2.0*unit * half_range;
+            //the value has to be narrowed to the concrete element type at some point
+            switch(type.size) {
+            case 8:
+                if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand;
+                else ((uint64_t *)data)[i] = (uint64_t) scaled_rand;
+            break;
+            case 4:
+                if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand;
+                else ((uint32_t *)data)[i] = (uint32_t) scaled_rand;
+            break;
+            case 2:
+                if(type.is_signed) ((int16_t *)data)[i] = (int16_t) scaled_rand;
+                else ((uint16_t *)data)[i] = (uint16_t) scaled_rand;
+            break;
+            case 1:
+                if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand;
+                else ((uint8_t *)data)[i] = (uint8_t) scaled_rand;
+            break;
+            default:
+                throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here
+            }
+        }
+    }
+}
+
+// Copies the first desc.n_impls entries of desc.impl_names into a vector of
+// std::string, in order. No filtering by CPU capability is applied.
+static std::vector<std::string> get_arch_list(volk_func_desc_t desc) {
+    std::vector<std::string> names;
+
+    size_t idx = 0;
+    while(idx < desc.n_impls) {
+        names.push_back(std::string(desc.impl_names[idx]));
+        ++idx;
+    }
+
+    return names;
+}
+
+// Decodes one type token of a kernel name (e.g. "32fc", "16i", "s32f").
+//   name - the token: an optional leading 's' (scalar), a bit width, then any
+//          of the qualifier letters 'f' (float), 'i' (signed), 'u' (unsigned)
+//          and 'c' (complex)
+// Returns the decoded description; size is stored in bytes and str keeps the
+// token exactly as it was passed in.
+// Throws std::string when the token is shorter than two characters, contains
+// no digit, or has an unknown qualifier letter; boost::lexical_cast throws
+// when the text up to the last digit is not an integer.
+volk_type_t volk_type_from_string(std::string name) {
+    volk_type_t type;
+    type.is_float = false;
+    type.is_scalar = false;
+    type.is_complex = false;
+    type.is_signed = false;
+    type.size = 0;
+    type.str = name;
+
+    if(name.size() < 2) throw std::string("name too short to be a datatype");
+
+    //is it a scalar?
+    if(name[0] == 's') {
+        type.is_scalar = true;
+        name = name.substr(1, name.size()-1);
+    }
+
+    //get the data size
+    size_t last_size_pos = name.find_last_of("0123456789");
+    //"no digit" is reported as npos; size_t is unsigned, so the former "< 0" test could never fire
+    if(last_size_pos == std::string::npos) throw std::string("no size spec in type ").append(name);
+    //will throw if malformed
+    int size = boost::lexical_cast<int>(name.substr(0, last_size_pos+1));
+
+    assert(((size % 8) == 0) && (size <= 64) && (size != 0));
+    type.size = size/8; //in bytes
+
+    for(size_t i=last_size_pos+1; i < name.size(); i++) {
+        switch (name[i]) {
+        case 'f':
+            type.is_float = true;
+            break;
+        case 'i':
+            type.is_signed = true;
+            break;
+        case 'c':
+            type.is_complex = true;
+            break;
+        case 'u':
+            type.is_signed = false;
+            break;
+        default:
+            //a bare "throw;" with no exception in flight calls std::terminate(),
+            //which no caller can catch; raise a real error instead
+            throw std::string("unknown type qualifier in ").append(name);
+        }
+    }
+
+    return type;
+}
+
+// Derives the argument type lists of a kernel from its name.
+//   inputsig  - receives (by push_back) the types found before the function-name part
+//   outputsig - receives (by push_back) the types found after the function-name part
+//   name      - kernel name; split on '_', first token must be "volk"
+// Each token is first tried as a type via volk_type_from_string(); tokens that
+// make it throw are handled in the catch block as a multiplier ("xN"), as part
+// of the function name, or as the trailing alignment token.
+// Asserts that at least one input type was found.
+static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
+ std::vector<volk_type_t> &outputsig,
+ std::string name) {
+ boost::char_separator<char> sep("_");
+ boost::tokenizer<boost::char_separator<char> > tok(name, sep);
+ std::vector<std::string> toked;
+ tok.assign(name);
+ toked.assign(tok.begin(), tok.end());
+
+ assert(toked[0] == "volk");
+ toked.erase(toked.begin());
+
+ //ok. we're assuming a string in the form
+ //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
+
+ // side tracks which part of the name the current token belongs to
+ enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT;
+ // fn_name is appended to below but never read in this function
+ std::string fn_name;
+ volk_type_t type;
+ BOOST_FOREACH(std::string token, toked) {
+ try {
+ type = volk_type_from_string(token);
+ if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
+
+ if(side == SIDE_INPUT) inputsig.push_back(type);
+ else outputsig.push_back(type);
+ } catch (...){
+ // the token is not a type: decide what else it can be
+ if(token[0] == 'x') { //it's a multiplier
+ if(side == SIDE_INPUT) assert(inputsig.size() > 0);
+ else assert(outputsig.size() > 0);
+ int multiplier = boost::lexical_cast<int>(token.substr(1, token.size()-1)); //will throw if invalid
+ // the type is already present once, so add multiplier-1 further copies
+ for(int i=1; i<multiplier; i++) {
+ if(side == SIDE_INPUT) inputsig.push_back(inputsig.back());
+ else outputsig.push_back(outputsig.back());
+ }
+ }
+ else if(side == SIDE_INPUT) { //it's the function name, at least it better be
+ side = SIDE_NAME;
+ fn_name.append("_");
+ fn_name.append(token);
+ }
+ else if(side == SIDE_OUTPUT) {
+ // rethrows the exception currently being handled
+ if(token != toked.back()) throw; //the last token in the name is the alignment
+ }
+ // any other non-type token seen while side == SIDE_NAME is ignored
+ }
+ }
+ //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input!
+ assert(inputsig.size() != 0);
+
+}
+
+// Trampolines used by run_volk_tests(): each one calls `func` exactly `iter`
+// times with the leading entries of `buffs`, an optional scalar, the vector
+// length and the implementation name `arch`.
+
+// One buffer, no scalar.
+inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch)
+{
+    for(unsigned int pass = 0; pass < iter; ++pass) {
+        func(buffs[0], vlen, arch.c_str());
+    }
+}
+
+// Two buffers, no scalar.
+inline void run_cast_test2(volk_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch)
+{
+    for(unsigned int pass = 0; pass < iter; ++pass) {
+        func(buffs[0], buffs[1], vlen, arch.c_str());
+    }
+}
+
+// Three buffers, no scalar.
+inline void run_cast_test3(volk_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch)
+{
+    for(unsigned int pass = 0; pass < iter; ++pass) {
+        func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
+    }
+}
+
+// Four buffers, no scalar.
+inline void run_cast_test4(volk_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch)
+{
+    for(unsigned int pass = 0; pass < iter; ++pass) {
+        func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
+    }
+}
+
+// One buffer plus a float scalar.
+inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch)
+{
+    for(unsigned int pass = 0; pass < iter; ++pass) {
+        func(buffs[0], scalar, vlen, arch.c_str());
+    }
+}
+
+// Two buffers plus a float scalar.
+inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch)
+{
+    for(unsigned int pass = 0; pass < iter; ++pass) {
+        func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+    }
+}
+
+// Three buffers plus a float scalar.
+inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch)
+{
+    for(unsigned int pass = 0; pass < iter; ++pass) {
+        func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+    }
+}
+
+// One buffer plus an lv_32fc_t scalar.
+inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch)
+{
+    for(unsigned int pass = 0; pass < iter; ++pass) {
+        func(buffs[0], scalar, vlen, arch.c_str());
+    }
+}
+
+// Two buffers plus an lv_32fc_t scalar.
+inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch)
+{
+    for(unsigned int pass = 0; pass < iter; ++pass) {
+        func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+    }
+}
+
+// Three buffers plus an lv_32fc_t scalar.
+inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch)
+{
+    for(unsigned int pass = 0; pass < iter; ++pass) {
+        func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+    }
+}
+
+template <class t>
+bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
+ bool fail = false;
+ int print_max_errs = 10;
+ for(unsigned int i=0; i<vlen; i++) {
+ if(((t *)(in1))[i] < 1e-30) continue; //this is a hack: below around here we'll start to get roundoff errors due to limited precision
+ if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) {
+ fail=true;
+ if(print_max_errs-- > 0) {
+ std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl;
+ }
+ }
+ }
+
+ return fail;
+}
+
+template <class t>
+bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) {
+ bool fail = false;
+ int print_max_errs = 10;
+ for(unsigned int i=0; i<vlen; i++) {
+ if(abs(int(((t *)(in1))[i]) - int(((t *)(in2))[i])) > tol) {
+ fail=true;
+ if(print_max_errs-- > 0) {
+ std::cout << "offset " << i << " in1: " << static_cast<int>(t(((t *)(in1))[i])) << " in2: " << static_cast<int>(t(((t *)(in2))[i])) << std::endl;
+ }
+ }
+ }
+
+ return fail;
+}
+
+// Hands out zero-filled buffers whose address is rounded up to a multiple of
+// volk_get_alignment(). The backing storage is kept in _mems, so it lives as
+// long as the pool object does.
+class volk_qa_aligned_mem_pool{
+public:
+    // Returns a pointer with at least `size` bytes available behind it.
+    // NOTE(review): the mask arithmetic assumes the alignment is a power of two - confirm.
+    void *get_new(size_t size){
+        const size_t mask = volk_get_alignment() - 1;
+        //over-allocate by alignment-1 bytes so the rounded-up address still has `size` bytes after it
+        _mems.push_back(std::vector<char>(size + mask, 0));
+        char *raw = &_mems.back().front();
+        const size_t rounded = (size_t(raw) + mask) & ~mask;
+        return (void *)rounded;
+    }
+private:
+    std::list<std::vector<char> > _mems;
+};
+
+// Runs every implementation of one kernel on the same random input, times
+// each of them, compares every buffer against the "generic" implementation's
+// buffers and reports the fastest implementations that passed.
+//   desc               - kernel descriptor; n_impls, impl_names and impl_alignment are read
+//   manual_func        - the kernel's *_manual entry point, cast to void(*)()
+//   name               - kernel name; parsed to derive the argument signature
+//   tol                - comparison tolerance (relative for floats, absolute for integers)
+//   scalar             - value passed for the scalar argument, if the kernel has one
+//   vlen               - number of elements per buffer
+//   iter               - number of calls per implementation while timing
+//   best_arch_vector   - optional; receives "<name> <best aligned> <best unaligned>"
+//   puppet_master_name - recorded instead of `name` unless it equals "NULL"
+// Returns true if any implementation disagreed with generic, false otherwise
+// (also false when fewer than two implementations are listed).
+// Throws a C string for argument/scalar combinations that have no handler.
+bool run_volk_tests(volk_func_desc_t desc,
+                    void (*manual_func)(),
+                    std::string name,
+                    float tol,
+                    lv_32fc_t scalar,
+                    int vlen,
+                    int iter,
+                    std::vector<std::string> *best_arch_vector = 0,
+                    std::string puppet_master_name = "NULL"
+                    ) {
+    std::cout << "RUN_VOLK_TESTS: " << name << std::endl;
+
+    //first let's get a list of available architectures for the test
+    std::vector<std::string> arch_list = get_arch_list(desc);
+
+    if(arch_list.size() < 2) {
+        std::cout << "no architectures to test" << std::endl;
+        return false;
+    }
+
+    //something that can hang onto memory and cleanup when this function exits
+    volk_qa_aligned_mem_pool mem_pool;
+
+    //now we have to get a function signature by parsing the name
+    std::vector<volk_type_t> inputsig, outputsig;
+    get_signatures_from_name(inputsig, outputsig, name);
+
+    //pull the input scalars into their own vector
+    std::vector<volk_type_t> inputsc;
+    for(size_t i=0; i<inputsig.size(); ) {
+        if(inputsig[i].is_scalar) {
+            inputsc.push_back(inputsig[i]);
+            inputsig.erase(inputsig.begin() + i); //the next element slides into slot i, so i stays put
+        } else {
+            i++;
+        }
+    }
+
+    //one randomly filled buffer per input, shared by all implementations
+    std::vector<void *> inbuffs;
+    BOOST_FOREACH(volk_type_t sig, inputsig) {
+        if(!sig.is_scalar) //we don't make buffers for scalars
+            inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1)));
+    }
+    for(size_t i=0; i<inbuffs.size(); i++) {
+        load_random_data(inbuffs[i], inputsig[i], vlen);
+    }
+
+    //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch
+    //layout per arch: private output buffers first, then the shared input buffers
+    std::vector<std::vector<void *> > test_data;
+    for(size_t i=0; i<arch_list.size(); i++) {
+        std::vector<void *> arch_buffs;
+        for(size_t j=0; j<outputsig.size(); j++) {
+            arch_buffs.push_back(mem_pool.get_new(vlen*outputsig[j].size*(outputsig[j].is_complex ? 2 : 1)));
+        }
+        for(size_t j=0; j<inputsig.size(); j++) {
+            arch_buffs.push_back(inbuffs[j]);
+        }
+        test_data.push_back(arch_buffs);
+    }
+
+    //types in the same order as the buffers in test_data[i]
+    std::vector<volk_type_t> both_sigs;
+    both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end());
+    both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end());
+
+    //now run the test
+    clock_t start, end;
+    std::vector<double> profile_times;
+    for(size_t i = 0; i < arch_list.size(); i++) {
+        start = clock();
+
+        //dispatch on the number of buffer arguments, then on the kind of scalar (none, float, complex float)
+        switch(both_sigs.size()) {
+            case 1:
+                if(inputsc.size() == 0) {
+                    run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+                    if(inputsc[0].is_complex) {
+                        run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+                    } else {
+                        run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+                    }
+                } else throw "unsupported 1 arg function >1 scalars";
+                break;
+            case 2:
+                if(inputsc.size() == 0) {
+                    run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+                    if(inputsc[0].is_complex) {
+                        run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+                    } else {
+                        run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+                    }
+                } else throw "unsupported 2 arg function >1 scalars";
+                break;
+            case 3:
+                if(inputsc.size() == 0) {
+                    run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+                    if(inputsc[0].is_complex) {
+                        run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+                    } else {
+                        run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+                    }
+                } else throw "unsupported 3 arg function >1 scalars";
+                break;
+            case 4:
+                run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+                break;
+            default:
+                throw "no function handler for this signature";
+                break;
+        }
+
+        end = clock();
+        double arch_time = (double)(end-start)/(double)CLOCKS_PER_SEC;
+        std::cout << arch_list[i] << " completed in " << arch_time << "s" << std::endl;
+
+        profile_times.push_back(arch_time);
+    }
+
+    //and now compare each output to the generic output
+    //first we have to know which output is the generic one, they aren't in order...
+    size_t generic_offset=0;
+    for(size_t i=0; i<arch_list.size(); i++)
+        if(arch_list[i] == "generic") generic_offset=i;
+
+    //now compare
+    bool fail_global = false;
+    std::vector<bool> arch_results;
+    for(size_t i=0; i<arch_list.size(); i++) {
+        //An implementation fails if ANY of its buffers differs from generic. The flag used to be
+        //overwritten for every buffer, so only the last one compared (normally a shared input,
+        //which always matches) decided the result and a failing implementation could still be
+        //ranked as "best".
+        bool arch_fail = false;
+        if(i != generic_offset) {
+            for(size_t j=0; j<both_sigs.size(); j++) {
+                const volk_type_t &sig = both_sigs[j];
+                void *ref = test_data[generic_offset][j];
+                void *got = test_data[i][j];
+                const unsigned int count = vlen*(sig.is_complex ? 2 : 1);
+                bool fail = false;
+
+                if(sig.is_float) {
+                    if(sig.size == 8) {
+                        fail = fcompare((double *) ref, (double *) got, count, tol);
+                    } else {
+                        fail = fcompare((float *) ref, (float *) got, count, tol);
+                    }
+                } else {
+                    //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ
+                    switch(sig.size) {
+                    case 8:
+                        if(sig.is_signed) {
+                            fail = icompare((int64_t *) ref, (int64_t *) got, count, tol);
+                        } else {
+                            fail = icompare((uint64_t *) ref, (uint64_t *) got, count, tol);
+                        }
+                        break;
+                    case 4:
+                        if(sig.is_signed) {
+                            fail = icompare((int32_t *) ref, (int32_t *) got, count, tol);
+                        } else {
+                            fail = icompare((uint32_t *) ref, (uint32_t *) got, count, tol);
+                        }
+                        break;
+                    case 2:
+                        if(sig.is_signed) {
+                            fail = icompare((int16_t *) ref, (int16_t *) got, count, tol);
+                        } else {
+                            fail = icompare((uint16_t *) ref, (uint16_t *) got, count, tol);
+                        }
+                        break;
+                    case 1:
+                        if(sig.is_signed) {
+                            fail = icompare((int8_t *) ref, (int8_t *) got, count, tol);
+                        } else {
+                            fail = icompare((uint8_t *) ref, (uint8_t *) got, count, tol);
+                        }
+                        break;
+                    default:
+                        fail = true; //unknown element size: treat as a failure
+                    }
+                }
+                if(fail) {
+                    arch_fail = true;
+                    fail_global = true;
+                    std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
+                }
+            }
+        }
+        arch_results.push_back(!arch_fail);
+    }
+
+    //pick the fastest passing implementation overall ("aligned") and the fastest
+    //passing one whose impl_alignment entry is 0 ("unaligned")
+    double best_time_a = std::numeric_limits<double>::max();
+    double best_time_u = std::numeric_limits<double>::max();
+    std::string best_arch_a = "generic";
+    std::string best_arch_u = "generic";
+    for(size_t i=0; i < arch_list.size(); i++)
+    {
+        if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0)
+        {
+            best_time_u = profile_times[i];
+            best_arch_u = arch_list[i];
+        }
+        if((profile_times[i] < best_time_a) && arch_results[i])
+        {
+            best_time_a = profile_times[i];
+            best_arch_a = arch_list[i];
+        }
+    }
+
+    std::cout << "Best aligned arch: " << best_arch_a << std::endl;
+    std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
+    if(best_arch_vector) {
+        if(puppet_master_name == "NULL") {
+            best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u);
+        }
+        else {
+            best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u);
+        }
+    }
+
+    return fail_global;
+}
+
+
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
new file mode 100644
index 000000000..0f17cdaa3
--- /dev/null
+++ b/volk/lib/qa_utils.h
@@ -0,0 +1,41 @@
+#ifndef VOLK_QA_UTILS_H
+#define VOLK_QA_UTILS_H
+
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include <volk/volk.h>
+#include <volk/volk_common.h>
+
+// Description of one kernel argument type, as filled in by volk_type_from_string().
+struct volk_type_t {
+ bool is_float; // set by the 'f' qualifier
+ bool is_scalar; // set when the token starts with 's'
+ bool is_signed; // set by 'i', cleared by 'u'
+ bool is_complex; // set by the 'c' qualifier
+ int size; // element width in bytes (bit width from the token divided by 8)
+ std::string str; // the token the description was parsed from
+};
+
+volk_type_t volk_type_from_string(std::string);
+
+float uniform(void);
+void random_floats(float *buf, unsigned n);
+
+bool run_volk_tests(volk_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string);
+
+
+#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); }
+#define VOLK_PROFILE(func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL")
+#define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func))
+typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
+typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*);
+typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*);
+typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
+typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar complex float input
+typedef void (*volk_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+
+#endif //VOLK_QA_UTILS_H
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
new file mode 100644
index 000000000..f133897cb
--- /dev/null
+++ b/volk/lib/testqa.cc
@@ -0,0 +1,90 @@
+#include "qa_utils.h"
+#include <volk/volk.h>
+#include <boost/test/unit_test.hpp>
+
+//VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000);
+//VOLK_RUN_TESTS(volk_16i_branch_4_state_8, 1e-4, 2046, 10000);
+VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f, 1e-5, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2, 1e-4, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_magnitude_16i, 1, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f, 1e-5, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16i_s32f_convert_32f, 1e-4, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16i_convert_8i, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_max_star_16i, 0, 0, 20460, 10000);
+//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i, 0, 0, 20460, 10000);
+//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 1000);
+//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 1000);
+VOLK_RUN_TESTS(volk_16u_byteswap, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_accumulator_s32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20460, 1);
+//VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_imag_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc, 1e-4, 0, 2046000, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32fc_index_max_16u, 3, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i, 1, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_magnitude_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_16i, 1, 32768, 20460, 1);
+//The scale is 2^31, written as a floating literal: the former "2<<31" overflows a 32-bit int,
+//which is undefined behaviour. 2^31 follows the full-scale factors used for the 16i (32768)
+//and 8i (128) conversions. NOTE(review): confirm 2^31 rather than 2^32 was intended.
+VOLK_RUN_TESTS(volk_32f_s32f_convert_32i, 1, 2147483648.0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_convert_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_8i, 1, 128, 20460, 1);
+//VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000);
+VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f, 1e-4, 0, 2046, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, 1e-4, 10, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_divide_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i, 1e-4, 0, 204600, 1);
+//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000);
+VOLK_RUN_TESTS(volk_32f_index_max_16u, 3, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic, 1, 32767, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_max_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_min_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_normalize, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_power_32f, 1e-4, 4, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_sqrt_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_subtract_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_x2_and_32i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_s32f_convert_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_x2_or_32i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32u_byteswap, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_32u_popcnt, 0, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_64f_convert_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64f_x2_max_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64f_x2_min_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64u_byteswap, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_64u_popcnt, 0, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i, 0, 256, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8i_convert_16i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8i_s32f_convert_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_conjugate_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_multiply_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1);
diff --git a/volk/lib/volk_prefs.c b/volk/lib/volk_prefs.c
new file mode 100644
index 000000000..f787b5e2a
--- /dev/null
+++ b/volk/lib/volk_prefs.c
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <volk/volk_prefs.h>
+
+//#if defined(_WIN32)
+//#include <Windows.h>
+//#endif
+
+//Builds the location of the preference file, "<home>/.volk/volk_config", where
+//<home> is the value of $HOME or, when that is unset, of $APPDATA.
+//  path: caller-provided output buffer. On success it holds the NUL-terminated
+//        path; when neither variable is set, or the result would need more than
+//        512 bytes, it holds the empty string.
+void volk_get_config_path(char *path)
+{
+    //Upper bound on the bytes written, terminator included. The size of the buffer is
+    //not passed in; the caller in this file provides 512 bytes.
+    const size_t max_path_len = 512;
+    const char *suffix = "/.volk/volk_config";
+    const char *home = getenv("HOME");
+    if (home == NULL) home = getenv("APPDATA");
+
+    //Failure is reported through the buffer itself: assigning NULL to the parameter,
+    //as was done before, only changed the local copy and left the buffer untouched.
+    path[0] = '\0';
+    if (home == NULL) return;
+    //would not fit: leave the path empty instead of writing past the buffer
+    if (strlen(home) + strlen(suffix) + 1 > max_path_len) return;
+
+    strcpy(path, home);
+    strcat(path, suffix);
+}
+
+//Reads the per-kernel implementation preferences from the VOLK config file.
+//Each accepted line has three whitespace-separated tokens (kernel name, aligned
+//implementation, unaligned implementation) and a name starting with "volk_".
+//  prefs_res: receives the realloc()-allocated array of entries. It is written
+//             only when the config file could be opened.
+//Returns the number of valid entries; 0 when there is no config path or file.
+size_t volk_load_preferences(volk_arch_pref_t **prefs_res)
+{
+    FILE *config_file;
+    char path[512], line[512];
+    size_t n_arch_prefs = 0;
+    volk_arch_pref_t *prefs = NULL;
+
+    //get the config path
+    //Start from an empty string so "no path" can be detected whatever the helper does on
+    //failure; the former "path == NULL" test on an array could never be true.
+    path[0] = '\0';
+    volk_get_config_path(path);
+    if (path[0] == '\0') return n_arch_prefs; //no prefs found
+    config_file = fopen(path, "r");
+    if(!config_file) return n_arch_prefs; //no prefs found
+
+    //read the file line by line and write the prefs into the array
+    while(fgets(line, sizeof(line), config_file) != NULL)
+    {
+        //make room for one more entry; the slot is only counted if the line parses
+        volk_arch_pref_t *grown = (volk_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs));
+        volk_arch_pref_t *p;
+        if(grown == NULL) break; //out of memory: keep the entries parsed so far
+        prefs = grown;
+        p = prefs + n_arch_prefs;
+        //NOTE(review): %s is unbounded, so a token longer than its destination field would
+        //overflow it - confirm the field sizes declared in volk_prefs.h and add widths
+        if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5))
+        {
+            n_arch_prefs++;
+        }
+    }
+    fclose(config_file);
+    *prefs_res = prefs;
+    return n_arch_prefs;
+}
diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c
new file mode 100644
index 000000000..6ab013f26
--- /dev/null
+++ b/volk/lib/volk_rank_archs.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <volk_rank_archs.h>
+#include <volk/volk_prefs.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+//Population count (number of set bits): the GCC builtin for GCC >= 3.4,
+//otherwise a portable shift-and-test loop.
+#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4
+  #define __popcnt __builtin_popcount
+#else
+  //static: a plain C99 "inline" definition provides no external definition,
+  //so a call that is not inlined would fail to link
+  static inline unsigned __popcnt(unsigned num)
+  {
+    unsigned pop = 0;
+    while(num)
+    {
+      if (num & 0x1) pop++;
+      num >>= 1;
+    }
+    return pop;
+  }
+#endif
+
+//Finds the position of an implementation name in a kernel's implementation list.
+//Only the first 20 characters of the names are compared.
+//Returns the index of impl_name. If it is not listed, a warning is printed and
+//the index of "generic" is returned instead; if "generic" is not listed either,
+//a warning is printed and -1 is returned.
+int volk_get_index(
+    const char *impl_names[], //list of implementations by name
+    const size_t n_impls,     //number of implementations available
+    const char *impl_name     //the implementation name to find
+){
+    unsigned int i;
+    for (i = 0; i < n_impls; i++) {
+        if(!strncmp(impl_names[i], impl_name, 20)) {
+            return i;
+        }
+    }
+    //The fallback below searches for "generic". If that very search failed, stop here:
+    //retrying it would recurse without end.
+    if(!strncmp(impl_name, "generic", 20)) {
+        printf("Volk warning: no generic impl found\n");
+        return -1;
+    }
+    printf("Volk warning: no arch found, returning generic impl\n");
+    return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
+}
+
+//Chooses which implementation of a kernel to use and returns its index.
+//The preference list is loaded once, on the first call, into function-local
+//statics. If it has an entry for kern_name, the implementation named there
+//(aligned or unaligned entry, depending on `align`) is looked up with
+//volk_get_index(). Otherwise the implementation whose dependency mask has the
+//most bits set is chosen: the best aligned one when `align` is set and an
+//aligned implementation exists, else the best unaligned one.
+int volk_rank_archs(
+    const char *kern_name,    //name of the kernel to rank
+    const char *impl_names[], //list of implementations by name
+    const int* impl_deps,     //requirement mask per implementation
+    const bool* alignment,    //alignment status of each implementation
+    size_t n_impls,           //number of implementations available
+    const bool align          //if false, filter aligned implementations
+){
+    static volk_arch_pref_t *volk_arch_prefs;
+    static size_t n_arch_prefs = 0;
+    static int prefs_loaded = 0;
+
+    size_t k;
+    size_t top_aligned_idx = 0;
+    size_t top_unaligned_idx = 0;
+    int top_aligned_deps = -1;   //-1 means "none seen yet"
+    int top_unaligned_deps = -1;
+
+    if(prefs_loaded == 0) {
+        n_arch_prefs = volk_load_preferences(&volk_arch_prefs);
+        prefs_loaded = 1;
+    }
+
+    //an explicit preference for this kernel wins over the ranking
+    for(k = 0; k < n_arch_prefs; k++)
+    {
+        const volk_arch_pref_t *entry = &volk_arch_prefs[k];
+        if(strncmp(kern_name, entry->name, sizeof(entry->name)) != 0) continue;
+        return volk_get_index(impl_names, n_impls, align ? entry->impl_a : entry->impl_u);
+    }
+
+    //no preference: rank by the number of bits set in the dependency mask
+    for(k = 0; k < n_impls; k++)
+    {
+        const signed deps = __popcnt(impl_deps[k]);
+        if(alignment[k])
+        {
+            if(deps > top_aligned_deps)
+            {
+                top_aligned_idx = k;
+                top_aligned_deps = deps;
+            }
+        }
+        else if(deps > top_unaligned_deps)
+        {
+            top_unaligned_idx = k;
+            top_unaligned_deps = deps;
+        }
+    }
+
+    //aligned pick only when requested and available, otherwise the unaligned pick
+    return (align && top_aligned_deps != -1) ? top_aligned_idx : top_unaligned_idx;
+}
diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h
new file mode 100644
index 000000000..b3bf8ff17
--- /dev/null
+++ b/volk/lib/volk_rank_archs.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_RANK_ARCHS_H
+#define INCLUDED_VOLK_RANK_ARCHS_H
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Look up impl_name in impl_names and return its index.
+ * The definition compares names with strncmp over at most 20 characters;
+ * when nothing matches it prints a warning and returns the index of the
+ * "generic" implementation instead.
+ */
+int volk_get_index(
+ const char *impl_names[], //list of implementations by name
+ const size_t n_impls, //number of implementations available
+ const char *impl_name //the implementation name to find
+);
+
+/*
+ * Return the index of the implementation to use for kernel kern_name.
+ * A matching entry in the loaded preferences takes priority; otherwise the
+ * implementation whose requirement mask has the most bits set is chosen,
+ * considering aligned implementations only when align is true.
+ */
+int volk_rank_archs(
+ const char *kern_name, //name of the kernel to rank
+ const char *impl_names[], //list of implementations by name
+ const int* impl_deps, //requirement mask per implementation
+ const bool* alignment, //alignment status of each implementation
+ size_t n_impls, //number of implementations available
+ const bool align //if false, filter aligned implementations
+);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/
diff --git a/volk/orc/volk_16i_s32f_deinterleave_32f_x2_a_orc_impl.orc b/volk/orc/volk_16i_s32f_deinterleave_32f_x2_a_orc_impl.orc
new file mode 100644
index 000000000..fd8915da0
--- /dev/null
+++ b/volk/orc/volk_16i_s32f_deinterleave_32f_x2_a_orc_impl.orc
@@ -0,0 +1,12 @@
+# Deinterleave packed pairs of 16-bit integers from src into two float
+# outputs (idst, qdst); every value is converted to float and divided by scalar.
+.function volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl
+.dest 4 idst
+.dest 4 qdst
+.source 4 src
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+
+# widen both 16-bit halves to 32 bits, convert to float, then scale
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+# split the float pair into its two halves
+splitql qdst, idst, iqf
diff --git a/volk/orc/volk_16ic_deinterleave_16i_x2_a_orc_impl.orc b/volk/orc/volk_16ic_deinterleave_16i_x2_a_orc_impl.orc
new file mode 100644
index 000000000..76faa936a
--- /dev/null
+++ b/volk/orc/volk_16ic_deinterleave_16i_x2_a_orc_impl.orc
@@ -0,0 +1,5 @@
+# Split every 4-byte src element into its two 16-bit halves, written to
+# qdst and idst.
+.function volk_16ic_deinterleave_16i_x2_a_orc_impl
+.dest 2 idst
+.dest 2 qdst
+.source 4 src
+splitlw qdst, idst, src
diff --git a/volk/orc/volk_16ic_deinterleave_real_8i_a_orc_impl.orc b/volk/orc/volk_16ic_deinterleave_real_8i_a_orc_impl.orc
new file mode 100644
index 000000000..8db49fd7c
--- /dev/null
+++ b/volk/orc/volk_16ic_deinterleave_real_8i_a_orc_impl.orc
@@ -0,0 +1,6 @@
+# Keep one 16-bit half of every 4-byte src element (select0lw) and narrow
+# it to 8 bits with convhwb.
+# NOTE(review): presumably the selected half is the real component - confirm.
+.function volk_16ic_deinterleave_real_8i_a_orc_impl
+.dest 1 dst
+.source 4 src
+.temp 2 iw
+select0lw iw, src
+convhwb dst, iw
diff --git a/volk/orc/volk_16ic_magnitude_16i_a_orc_impl.orc b/volk/orc/volk_16ic_magnitude_16i_a_orc_impl.orc
new file mode 100644
index 000000000..fbaebc46d
--- /dev/null
+++ b/volk/orc/volk_16ic_magnitude_16i_a_orc_impl.orc
@@ -0,0 +1,23 @@
+# Magnitude of packed 16-bit integer pairs, written as 16-bit integers.
+# Both halves are converted to float and divided by scalar before squaring;
+# the square root of their sum is multiplied by scalar again before the
+# conversion back to integer.
+.function volk_16ic_magnitude_16i_a_orc_impl
+.source 4 src
+.dest 2 dst
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+
+# widen, convert to float and scale down both halves
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+# square both halves, add the squares, take the root
+x2 mulf prodiqf, iqf, iqf
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf rootf, sumf
+# scale back up and narrow float -> 32-bit int -> 16-bit int
+mulf rootf, rootf, scalar
+convfl rootl, rootf
+convlw dst, rootl
diff --git a/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
new file mode 100644
index 000000000..66fef7d2e
--- /dev/null
+++ b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
@@ -0,0 +1,25 @@
+# Magnitude of packed 16-bit integer pairs, written as floats:
+# each half is converted to float and divided by scalar, then
+# dst = sqrt(real^2 + imag^2).
+.function volk_16ic_magnitude_32f_a_orc_impl
+.source 4 src
+.dest 4 dst
+.floatparam 4 scalar
+.temp 4 reall
+.temp 4 imagl
+.temp 2 reals
+.temp 2 imags
+.temp 4 realf
+.temp 4 imagf
+.temp 4 sumf
+
+
+
+# separate the two 16-bit halves, widen each to 32 bits and convert to float
+splitlw reals, imags, src
+convswl reall, reals
+convswl imagl, imags
+convlf realf, reall
+convlf imagf, imagl
+# scale, square, sum, root
+divf realf, realf, scalar
+divf imagf, imagf, scalar
+mulf realf, realf, realf
+mulf imagf, imagf, imagf
+addf sumf, realf, imagf
+sqrtf dst, sumf
diff --git a/volk/orc/volk_16u_byteswap_a_orc_impl.orc b/volk/orc/volk_16u_byteswap_a_orc_impl.orc
new file mode 100644
index 000000000..b96ba84af
--- /dev/null
+++ b/volk/orc/volk_16u_byteswap_a_orc_impl.orc
@@ -0,0 +1,3 @@
+# Byte-swap every 16-bit element of dst in place (dst is both input and output).
+.function volk_16u_byteswap_a_orc_impl
+.dest 2 dst
+swapw dst, dst
diff --git a/volk/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc b/volk/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc
new file mode 100644
index 000000000..ea23fc045
--- /dev/null
+++ b/volk/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc
@@ -0,0 +1,5 @@
+# dst = src1 * scalar for every 32-bit float element.
+.function volk_32f_s32f_multiply_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.floatparam 4 scalar
+mulf dst, src1, scalar
diff --git a/volk/orc/volk_32f_s32f_normalize_a_orc_impl.orc b/volk/orc/volk_32f_s32f_normalize_a_orc_impl.orc
new file mode 100644
index 000000000..986fdf665
--- /dev/null
+++ b/volk/orc/volk_32f_s32f_normalize_a_orc_impl.orc
@@ -0,0 +1,5 @@
+# dst = src1 * invscalar for every 32-bit float element.
+# NOTE(review): the parameter name suggests the caller passes the reciprocal
+# of the normalization factor - confirm against the calling wrapper.
+.function volk_32f_s32f_normalize_a_orc_impl
+.source 4 src1
+.floatparam 4 invscalar
+.dest 4 dst
+mulf dst, src1, invscalar
diff --git a/volk/orc/volk_32f_sqrt_32f_a_orc_impl.orc b/volk/orc/volk_32f_sqrt_32f_a_orc_impl.orc
new file mode 100644
index 000000000..f339b1122
--- /dev/null
+++ b/volk/orc/volk_32f_sqrt_32f_a_orc_impl.orc
@@ -0,0 +1,4 @@
+# dst = sqrt(src) for every 32-bit float element.
+.function volk_32f_sqrt_32f_a_orc_impl
+.source 4 src
+.dest 4 dst
+sqrtf dst, src
diff --git a/volk/orc/volk_32f_x2_add_32f_a_orc_impl.orc b/volk/orc/volk_32f_x2_add_32f_a_orc_impl.orc
new file mode 100644
index 000000000..450cc6a9e
--- /dev/null
+++ b/volk/orc/volk_32f_x2_add_32f_a_orc_impl.orc
@@ -0,0 +1,5 @@
+# dst = src1 + src2 for every 32-bit float element.
+.function volk_32f_x2_add_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+addf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_divide_32f_a_orc_impl.orc b/volk/orc/volk_32f_x2_divide_32f_a_orc_impl.orc
new file mode 100644
index 000000000..ee3b61b82
--- /dev/null
+++ b/volk/orc/volk_32f_x2_divide_32f_a_orc_impl.orc
@@ -0,0 +1,5 @@
+# dst = src1 / src2 for every 32-bit float element.
+.function volk_32f_x2_divide_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+divf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_dot_prod_32f_a_orc_impl.orc b/volk/orc/volk_32f_x2_dot_prod_32f_a_orc_impl.orc
new file mode 100644
index 000000000..b367f3091
--- /dev/null
+++ b/volk/orc/volk_32f_x2_dot_prod_32f_a_orc_impl.orc
@@ -0,0 +1,6 @@
+# As written this stores the element-wise sum src1 + src2 into dst.
+# NOTE(review): the accumulator accum is declared but never referenced and
+# no multiply is performed, so this does not look like a dot product -
+# confirm whether this implementation is actually used.
+.function volk_32f_x2_dot_prod_32f_a_orc_impl
+.source 4 src1
+.source 4 src2
+.dest 4 dst
+.accumulator 4 accum
+addf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_max_32f_a_orc_impl.orc b/volk/orc/volk_32f_x2_max_32f_a_orc_impl.orc
new file mode 100644
index 000000000..725201633
--- /dev/null
+++ b/volk/orc/volk_32f_x2_max_32f_a_orc_impl.orc
@@ -0,0 +1,5 @@
+# dst = max(src1, src2) for every 32-bit float element.
+.function volk_32f_x2_max_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+maxf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_min_32f_a_orc_impl.orc b/volk/orc/volk_32f_x2_min_32f_a_orc_impl.orc
new file mode 100644
index 000000000..a71ed8250
--- /dev/null
+++ b/volk/orc/volk_32f_x2_min_32f_a_orc_impl.orc
@@ -0,0 +1,5 @@
+# dst = min(src1, src2) for every 32-bit float element.
+.function volk_32f_x2_min_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+minf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_multiply_32f_a_orc_impl.orc b/volk/orc/volk_32f_x2_multiply_32f_a_orc_impl.orc
new file mode 100644
index 000000000..c17d539fd
--- /dev/null
+++ b/volk/orc/volk_32f_x2_multiply_32f_a_orc_impl.orc
@@ -0,0 +1,5 @@
+# dst = src1 * src2 for every 32-bit float element.
+.function volk_32f_x2_multiply_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+mulf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_subtract_32f_a_orc_impl.orc b/volk/orc/volk_32f_x2_subtract_32f_a_orc_impl.orc
new file mode 100644
index 000000000..b3b0f256e
--- /dev/null
+++ b/volk/orc/volk_32f_x2_subtract_32f_a_orc_impl.orc
@@ -0,0 +1,5 @@
+# dst = src1 - src2 for every 32-bit float element.
+.function volk_32f_x2_subtract_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+subf dst, src1, src2
diff --git a/volk/orc/volk_32fc_32f_multiply_32fc_a_orc_impl.orc b/volk/orc/volk_32fc_32f_multiply_32fc_a_orc_impl.orc
new file mode 100644
index 000000000..aa82699f5
--- /dev/null
+++ b/volk/orc/volk_32fc_32f_multiply_32fc_a_orc_impl.orc
@@ -0,0 +1,7 @@
+# Multiply each 8-byte src1 element (a pair of floats) by the matching
+# float in src2.
+.function volk_32fc_32f_multiply_32fc_a_orc_impl
+.source 8 src1
+.source 4 src2
+.dest 8 dst
+.temp 8 tmp
+# duplicate the src2 float into both halves so one x2 multiply scales the pair
+mergelq tmp, src2, src2
+x2 mulf dst, src1, tmp
diff --git a/volk/orc/volk_32fc_magnitude_32f_a_orc_impl.orc b/volk/orc/volk_32fc_magnitude_32f_a_orc_impl.orc
new file mode 100644
index 000000000..032ab2b1b
--- /dev/null
+++ b/volk/orc/volk_32fc_magnitude_32f_a_orc_impl.orc
@@ -0,0 +1,13 @@
+# dst = sqrt(a^2 + b^2) where (a, b) are the two floats of each 8-byte src
+# element.
+# NOTE(review): temp iqf is declared but not used by any instruction below.
+.function volk_32fc_magnitude_32f_a_orc_impl
+.source 8 src
+.dest 4 dst
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+
+# square both halves, then add the squares and take the root
+x2 mulf prodiqf, src, src
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf dst, sumf
diff --git a/volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc b/volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc
new file mode 100644
index 000000000..d3bf78935
--- /dev/null
+++ b/volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc
@@ -0,0 +1,23 @@
+# Magnitude of each pair of floats in src, multiplied by scalar and stored
+# as a 16-bit integer: sqrt(a^2 + b^2) * scalar, float -> 32-bit int ->
+# 16-bit via convsuslw.
+# The commented-out lines are a disabled masking step kept from the original.
+.function volk_32fc_s32f_magnitude_16i_a_orc_impl
+.source 8 src
+.dest 2 dst
+.floatparam 4 scalar
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+#.temp 4 maskl
+
+# square both halves, add the squares, take the root, then scale
+x2 mulf prodiqf, src, src
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf rootf, sumf
+mulf rootf, rootf, scalar
+#cmpltf maskl, 32768.0, rootf
+#andl maskl, maskl, 0x80000000
+#orl rootf, rootf, maskl
+# narrow the result to a 16-bit integer
+convfl rootl, rootf
+convsuslw dst, rootl
diff --git a/volk/orc/volk_32fc_s32fc_multiply_32fc_a_orc_impl.orc b/volk/orc/volk_32fc_s32fc_multiply_32fc_a_orc_impl.orc
new file mode 100644
index 000000000..2577e034f
--- /dev/null
+++ b/volk/orc/volk_32fc_s32fc_multiply_32fc_a_orc_impl.orc
@@ -0,0 +1,18 @@
+# Complex multiply of every src1 element (a pair of floats) by the constant
+# pair scalar.
+# real = difference of the two lane-wise products;
+# imag = sum of the two lane-wise products taken after swapping the halves
+# of src1.
+.function volk_32fc_s32fc_multiply_32fc_a_orc_impl
+.source 8 src1
+.floatparam 8 scalar
+.dest 8 dst
+.temp 8 iqprod
+.temp 4 real
+.temp 4 imag
+.temp 4 ac
+.temp 4 bd
+.temp 8 swapped
+# real part
+x2 mulf iqprod, src1, scalar
+splitql bd, ac, iqprod
+subf real, ac, bd
+# imaginary part: same products with the halves of src1 exchanged
+# (ac and bd are reused here as scratch for the cross products)
+swaplq swapped, src1
+x2 mulf iqprod, swapped, scalar
+splitql bd, ac, iqprod
+addf imag, ac, bd
+# repack the two results into one 8-byte element
+mergelq dst, real, imag
diff --git a/volk/orc/volk_32fc_x2_multiply_32fc_a_orc_impl.orc b/volk/orc/volk_32fc_x2_multiply_32fc_a_orc_impl.orc
new file mode 100644
index 000000000..cb8a12d81
--- /dev/null
+++ b/volk/orc/volk_32fc_x2_multiply_32fc_a_orc_impl.orc
@@ -0,0 +1,18 @@
+# Element-wise complex multiply of src1 by src2 (each element is a pair of
+# floats).
+# real = difference of the two lane-wise products;
+# imag = sum of the two lane-wise products taken after swapping the halves
+# of src1.
+.function volk_32fc_x2_multiply_32fc_a_orc_impl
+.source 8 src1
+.source 8 src2
+.dest 8 dst
+.temp 8 iqprod
+.temp 4 real
+.temp 4 imag
+.temp 4 ac
+.temp 4 bd
+.temp 8 swapped
+# real part
+x2 mulf iqprod, src1, src2
+splitql bd, ac, iqprod
+subf real, ac, bd
+# imaginary part: same products with the halves of src1 exchanged
+# (ac and bd are reused here as scratch for the cross products)
+swaplq swapped, src1
+x2 mulf iqprod, swapped, src2
+splitql bd, ac, iqprod
+addf imag, ac, bd
+# repack the two results into one 8-byte element
+mergelq dst, real, imag
diff --git a/volk/orc/volk_32i_x2_and_32i_a_orc_impl.orc b/volk/orc/volk_32i_x2_and_32i_a_orc_impl.orc
new file mode 100644
index 000000000..1845e4654
--- /dev/null
+++ b/volk/orc/volk_32i_x2_and_32i_a_orc_impl.orc
@@ -0,0 +1,5 @@
+# dst = src1 & src2 (bitwise AND) for every 32-bit element.
+.function volk_32i_x2_and_32i_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+andl dst, src1, src2
diff --git a/volk/orc/volk_32i_x2_or_32i_a_orc_impl.orc b/volk/orc/volk_32i_x2_or_32i_a_orc_impl.orc
new file mode 100644
index 000000000..004663f42
--- /dev/null
+++ b/volk/orc/volk_32i_x2_or_32i_a_orc_impl.orc
@@ -0,0 +1,5 @@
+# dst = src1 | src2 (bitwise OR) for every 32-bit element.
+.function volk_32i_x2_or_32i_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+orl dst, src1, src2
diff --git a/volk/orc/volk_8i_convert_16i_a_orc_impl.orc b/volk/orc/volk_8i_convert_16i_a_orc_impl.orc
new file mode 100644
index 000000000..17198bf1e
--- /dev/null
+++ b/volk/orc/volk_8i_convert_16i_a_orc_impl.orc
@@ -0,0 +1,6 @@
+# Widen every signed 8-bit src element to 16 bits and shift it left by 8,
+# so the input byte ends up in the upper byte of the 16-bit result.
+.function volk_8i_convert_16i_a_orc_impl
+.source 1 src
+.dest 2 dst
+.temp 2 tmp
+convsbw tmp, src
+shlw dst, tmp, 8
diff --git a/volk/orc/volk_8i_s32f_convert_32f_a_orc_impl.orc b/volk/orc/volk_8i_s32f_convert_32f_a_orc_impl.orc
new file mode 100644
index 000000000..ad54fb1e1
--- /dev/null
+++ b/volk/orc/volk_8i_s32f_convert_32f_a_orc_impl.orc
@@ -0,0 +1,11 @@
+# Convert every signed 8-bit src element to float and multiply it by scalar.
+# NOTE(review): this multiplies rather than divides by scalar; presumably the
+# caller passes the reciprocal of its scale factor - confirm against the wrapper.
+.function volk_8i_s32f_convert_32f_a_orc_impl
+.source 1 src
+.dest 4 dst
+.floatparam 4 scalar
+.temp 4 flsrc
+.temp 4 lsrc
+.temp 2 ssrc
+# widen 8 -> 16 -> 32 bits (sign-extending), then convert to float
+convsbw ssrc, src
+convswl lsrc, ssrc
+convlf flsrc, lsrc
+mulf dst, flsrc, scalar
diff --git a/volk/spu_lib/gc_spu_macs.h b/volk/spu_lib/gc_spu_macs.h
new file mode 100644
index 000000000..e86dce3f5
--- /dev/null
+++ b/volk/spu_lib/gc_spu_macs.h
@@ -0,0 +1,380 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef INCLUDED_GC_SPU_MACS_H
+#define INCLUDED_GC_SPU_MACS_H
+
+/*
+ * This file contains a set of macros that are generally useful when
+ * coding in SPU assembler
+ *
+ * Note that the multi-instruction macros in here may overwrite
+ * registers 77, 78, and 79 without warning.
+ */
+
+/*
+ * defines for all registers
+ */
+#define r0 $0
+#define r1 $1
+#define r2 $2
+#define r3 $3
+#define r4 $4
+#define r5 $5
+#define r6 $6
+#define r7 $7
+#define r8 $8
+#define r9 $9
+#define r10 $10
+#define r11 $11
+#define r12 $12
+#define r13 $13
+#define r14 $14
+#define r15 $15
+#define r16 $16
+#define r17 $17
+#define r18 $18
+#define r19 $19
+#define r20 $20
+#define r21 $21
+#define r22 $22
+#define r23 $23
+#define r24 $24
+#define r25 $25
+#define r26 $26
+#define r27 $27
+#define r28 $28
+#define r29 $29
+#define r30 $30
+#define r31 $31
+#define r32 $32
+#define r33 $33
+#define r34 $34
+#define r35 $35
+#define r36 $36
+#define r37 $37
+#define r38 $38
+#define r39 $39
+#define r40 $40
+#define r41 $41
+#define r42 $42
+#define r43 $43
+#define r44 $44
+#define r45 $45
+#define r46 $46
+#define r47 $47
+#define r48 $48
+#define r49 $49
+#define r50 $50
+#define r51 $51
+#define r52 $52
+#define r53 $53
+#define r54 $54
+#define r55 $55
+#define r56 $56
+#define r57 $57
+#define r58 $58
+#define r59 $59
+#define r60 $60
+#define r61 $61
+#define r62 $62
+#define r63 $63
+#define r64 $64
+#define r65 $65
+#define r66 $66
+#define r67 $67
+#define r68 $68
+#define r69 $69
+#define r70 $70
+#define r71 $71
+#define r72 $72
+#define r73 $73
+#define r74 $74
+#define r75 $75
+#define r76 $76
+#define r77 $77
+#define r78 $78
+#define r79 $79
+#define r80 $80
+#define r81 $81
+#define r82 $82
+#define r83 $83
+#define r84 $84
+#define r85 $85
+#define r86 $86
+#define r87 $87
+#define r88 $88
+#define r89 $89
+#define r90 $90
+#define r91 $91
+#define r92 $92
+#define r93 $93
+#define r94 $94
+#define r95 $95
+#define r96 $96
+#define r97 $97
+#define r98 $98
+#define r99 $99
+#define r100 $100
+#define r101 $101
+#define r102 $102
+#define r103 $103
+#define r104 $104
+#define r105 $105
+#define r106 $106
+#define r107 $107
+#define r108 $108
+#define r109 $109
+#define r110 $110
+#define r111 $111
+#define r112 $112
+#define r113 $113
+#define r114 $114
+#define r115 $115
+#define r116 $116
+#define r117 $117
+#define r118 $118
+#define r119 $119
+#define r120 $120
+#define r121 $121
+#define r122 $122
+#define r123 $123
+#define r124 $124
+#define r125 $125
+#define r126 $126
+#define r127 $127
+
+
+#define lr r0 // link register
+#define sp r1 // stack pointer
+ // r2 is environment pointer for langs that need it (ALGOL)
+
+#define retval r3 // return values are passed in regs starting at r3
+
+#define arg1 r3 // args are passed in regs starting at r3
+#define arg2 r4
+#define arg3 r5
+#define arg4 r6
+#define arg5 r7
+#define arg6 r8
+#define arg7 r9
+#define arg8 r10
+#define arg9 r11
+#define arg10 r12
+
+// r3 - r74 are volatile (caller saves)
+// r75 - r79 are volatile (scratch regs possibly destroyed by fct prolog/epilog)
+// r80 - r127 are non-volatile (callee saves)
+
+// scratch registers reserved for use by the macros in this file.
+
+#define _gc_t0 r79
+#define _gc_t1 r78
+#define _gc_t2 r77
+
+/*
+ * ----------------------------------------------------------------
+ * pseudo ops
+ * ----------------------------------------------------------------
+ */
+#define PROC_ENTRY(name) \
+ .text; \
+ .p2align 4; \
+ .global name; \
+ .type name, @function; \
+name:
+
+/*
+ * ----------------------------------------------------------------
+ * aliases for common operations
+ * ----------------------------------------------------------------
+ */
+
+// Move register (even pipe, 2 cycles)
+#define MR(rt, ra) or rt, ra, ra;
+
+// Move register (odd pipe, 4 cycles)
+#define LMR(rt, ra) rotqbyi rt, ra, 0;
+
+// return
+#define RETURN() bi lr;
+
+// hint for a return
+#define HINT_RETURN(ret_label) hbr ret_label, lr;
+
+// return if zero
+#define BRZ_RETURN(rt) biz rt, lr;
+
+// return if not zero
+#define BRNZ_RETURN(rt) binz rt, lr;
+
+// return if halfword zero
+#define BRHZ_RETURN(rt) bihz rt, lr;
+
+// return if halfword not zero
+#define BRHNZ_RETURN(rt) bihnz rt, lr;
+
+
+/*
+ * ----------------------------------------------------------------
+ * modulo like things for constant moduli that are powers of 2
+ * ----------------------------------------------------------------
+ */
+
+// rt = ra & (pow2 - 1)
+#define MODULO(rt, ra, pow2) \
+ andi rt, ra, (pow2)-1;
+
+// rt = pow2 - (ra & (pow2 - 1))
+#define MODULO_NEG(rt, ra, pow2) \
+ andi rt, ra, (pow2)-1; \
+ sfi rt, rt, (pow2);
+
+// rt = ra & -(pow2)
+#define ROUND_DOWN(rt, ra, pow2) \
+ andi rt, ra, -(pow2);
+
+// rt = (ra + (pow2 - 1)) & -(pow2)
+#define ROUND_UP(rt, ra, pow2) \
+ ai rt, ra, (pow2)-1; \
+ andi rt, rt, -(pow2);
+
+/*
+ * ----------------------------------------------------------------
+ * Splat - replicate a particular slot into all slots
+ * Altivec analogs...
+ * ----------------------------------------------------------------
+ */
+
+// replicate byte from slot s [0,15]
+#define VSPLTB(rt, ra, s) \
+ ilh _gc_t0, (s)*0x0101; \
+ shufb rt, ra, ra, _gc_t0;
+
+// replicate halfword from slot s [0,7]
+#define VSPLTH(rt, ra, s) \
+ ilh _gc_t0, 2*(s)*0x0101 + 0x0001; \
+ shufb rt, ra, ra, _gc_t0;
+
+// replicate word from slot s [0,3]
+// Builds the shufb control word {4s, 4s+1, 4s+2, 4s+3} in _gc_t0: ilhu loads
+// the upper halfword (4s, 4s+1) and iohl ORs in the lower halfword
+// (4s+2, 4s+3). Clobbers _gc_t0.
+#define VSPLTW(rt, ra, s) \
+ ilhu _gc_t0, 4*(s)*0x0101 + 0x0001; \
+ iohl _gc_t0, 4*(s)*0x0101 + 0x0203; \
+ shufb rt, ra, ra, _gc_t0;
+
+// replicate double from slot s [0,1]
+// cdd produces the shuffle control shown below; the selected double is first
+// rotated into the preferred slot (rotqbyi takes rt, ra and a byte count) and
+// then copied into both doubleword positions by shufb. Clobbers _gc_t0.
+#define VSPLTD(rt, ra, s) \
+ /* sp is always 16-byte aligned */ \
+ cdd _gc_t0, 8(sp); /* 0x10111213 14151617 00010203 04050607 */ \
+ rotqbyi rt, ra, (s) << 3; /* rotate double into preferred slot */ \
+ shufb rt, rt, rt, _gc_t0;
+
+/*
+ * ----------------------------------------------------------------
+ * lots of min/max variations...
+ *
+ * On a slot by slot basis, compute the min or max
+ *
+ * U - unsigned, else signed
+ * B,H,{} - byte, halfword, word
+ * F float
+ * ----------------------------------------------------------------
+ */
+
+#define MIN_SELB(rt, ra, rb, rc) selb rt, ra, rb, rc;
+#define MAX_SELB(rt, ra, rb, rc) selb rt, rb, ra, rc;
+
+ // words
+
+#define MIN(rt, ra, rb) \
+ cgt _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAX(rt, ra, rb) \
+ cgt _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMIN(rt, ra, rb) \
+ clgt _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAX(rt, ra, rb) \
+ clgt _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+ // bytes
+
+#define MINB(rt, ra, rb) \
+ cgtb _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAXB(rt, ra, rb) \
+ cgtb _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMINB(rt, ra, rb) \
+ clgtb _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAXB(rt, ra, rb) \
+ clgtb _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+ // halfwords
+
+#define MINH(rt, ra, rb) \
+ cgth _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAXH(rt, ra, rb) \
+ cgth _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMINH(rt, ra, rb) \
+ clgth _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAXH(rt, ra, rb) \
+ clgth _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+ // floats
+
+#define FMIN(rt, ra, rb) \
+ fcgt _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define FMAX(rt, ra, rb) \
+ fcgt _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+// Ignoring the sign, select the values with the minimum magnitude
+#define FMINMAG(rt, ra, rb) \
+ fcmgt _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+// Ignoring the sign, select the values with the maximum magnitude
+#define FMAXMAG(rt, ra, rb) \
+ fcmgt _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+
+#endif /* INCLUDED_GC_SPU_MACS_H */
diff --git a/volk/spu_lib/spu_16s_cmpgt_unaligned.c b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
new file mode 100644
index 000000000..8811e6801
--- /dev/null
+++ b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
@@ -0,0 +1,160 @@
+#include<spu_intrinsics.h>
+
+/*
+ * Element-wise "greater than" on signed 16-bit values for buffers that need
+ * not be 16-byte aligned.
+ *
+ * For every signed short read from src, the corresponding halfword written
+ * to target is 1 when the value is greater than val and 0 otherwise.
+ * Data is moved one quadword at a time with si_lqd/si_stqd; shuffle masks
+ * derived from src%16, target%16 and num_bytes%16 realign the input and
+ * merge the result with the bytes already present in target.
+ *
+ * target    - destination buffer
+ * src       - source buffer of signed shorts
+ * val       - value every element is compared against
+ * num_bytes - number of bytes to process
+ *
+ * Returns target.
+ *
+ * NOTE(review): each step loads and stores the quadword at offset 16 as
+ * well as offset 0, so quadwords around the requested range are read and
+ * rewritten - confirm this is acceptable for every caller.
+ */
+void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, unsigned int num_bytes){
+ //loop iterator i
+ int i = 0;
+ void* retval = target;
+
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at (unsigned int)src%16
+ vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ //cmp_res is all-ones in every byte position >= the break point
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ //adding 16 to an index makes shuffle pick that byte from the second operand
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
+
+
+
+
+ vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+ //NOTE(review): tgt_first is computed but not referenced again in this function
+ vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+ //alpha: first half of first, second half of second, break at (unsigned int)target%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+
+
+
+
+ qword src_past;
+ qword src_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp;
+ qword out_temp0;
+ qword out_temp1;
+
+ //prime the pipeline with the first source and target quadwords
+ src_past = si_lqd((qword)address_counter_src, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ vector signed short vec_val = spu_splats(val);
+ vector unsigned short compare;
+ vector unsigned short ones = {1, 1, 1, 1, 1, 1, 1, 1};
+ vector unsigned short after_and;
+
+ //main loop: one full quadword (8 shorts) of output per iteration
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ //realign 16 source bytes that straddle two quadwords
+ in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+ //the compare yields an all-ones halfword on true; the AND reduces that to 1
+ compare = spu_cmpgt((vector signed short) in_temp, vec_val);
+ after_and = spu_and(compare, ones);
+
+
+ //merge the result into the two target quadwords it overlaps
+ out_temp0 = spu_shuffle(tgt_past, (qword)after_and, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, (qword)after_and, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src_past = src_present;
+ address_counter_src = spu_add(address_counter_src, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ //tail: handle the remaining num_bytes%16 bytes
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
+
+ compare = spu_cmpgt((vector signed short) in_temp, vec_val);
+ after_and = spu_and(compare, ones);
+
+
+ //meld combines the new result with the existing target bytes, split at num_bytes%16 (beta)
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle((qword)after_and, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+}
+
+
+
+/*
+int main(){
+
+ signed short pooh[48];
+ signed short bear[48];
+
+ int i = 0;
+ for(i = 0; i < 48; i += 2){
+ bear[i] = i;
+ bear[i + 1] = -i;
+ }
+
+ vector_gt_16bit(&pooh[0],&bear[0], 0, 48 * sizeof(signed short));
+
+ for(i = 0; i < 48; ++i) {
+ printf("%d, ", pooh[i]);
+ }
+ printf("\n");
+}
+*/
+
diff --git a/volk/spu_lib/spu_16s_vector_subtract_unaligned.c b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
new file mode 100644
index 000000000..ea110c8d2
--- /dev/null
+++ b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
@@ -0,0 +1,178 @@
+#include<spu_intrinsics.h>
+
+/*
+ * Element-wise subtraction of signed 16-bit values for buffers that need not
+ * be 16-byte aligned: target[k] = src0[k] - src1[k].
+ *
+ * Data is moved one quadword at a time with si_lqd/si_stqd; shuffle masks
+ * derived from src0%16, src1%16, target%16 and num_bytes%16 realign both
+ * inputs and merge the result with the bytes already present in target.
+ *
+ * target    - destination buffer
+ * src0      - minuend buffer of signed shorts
+ * src1      - subtrahend buffer of signed shorts
+ * num_bytes - number of bytes to process
+ *
+ * Returns target.
+ *
+ * NOTE(review): each step loads and stores the quadword at offset 16 as
+ * well as offset 0, so quadwords around the requested range are read and
+ * rewritten - confirm this is acceptable for every caller.
+ */
+void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+ //loop iterator i
+ int i = 0;
+ void* retval = target;
+
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
+ vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+ vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ //cmp_res is all-ones in every byte position >= the break point
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ //adding 16 to an index makes shuffle pick that byte from the second operand
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+ //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ sixteen_uchar = spu_splats((unsigned char)16);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+
+
+
+
+ vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+ //NOTE(review): tgt_first is computed but not referenced again in this function
+ vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+ //alpha: first half of first, second half of second, break at (unsigned int)target%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+
+
+
+
+ qword src0_past;
+ qword src0_present;
+ qword src1_past;
+ qword src1_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp0;
+ qword in_temp1;
+ qword out_temp0;
+ qword out_temp1;
+
+ //despite its name, sum holds the difference src0 - src1
+ vector signed short sum;
+
+ //prime the pipeline with the first quadword of each buffer
+ src0_past = si_lqd((qword)address_counter_src0, 0);
+ src1_past = si_lqd((qword)address_counter_src1, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ //main loop: one full quadword (8 shorts) of output per iteration
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src0_present = si_lqd((qword)address_counter_src0, 16);
+ src1_present = si_lqd((qword)address_counter_src1, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ //realign 16 bytes of each source that straddle two quadwords
+ in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+ in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+ sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
+
+
+ //merge the result into the two target quadwords it overlaps
+ out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src0_past = src0_present;
+ src1_past = src1_present;
+ address_counter_src0 = spu_add(address_counter_src0, 16);
+ address_counter_src1 = spu_add(address_counter_src1, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ //tail: handle the remaining num_bytes%16 bytes
+ src0_present = si_lqd((qword)address_counter_src0, 16);
+ src1_present = si_lqd((qword)address_counter_src1, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+ in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+ sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
+ //meld combines the new result with the existing target bytes, split at num_bytes%16 (beta)
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+}
+
+
+
+/*
+int main(){
+
+ signed short pooh[48];
+ signed short bear[48];
+ signed short res[48];
+
+ int i = 0;
+ for(i = 0; i < 48; ++i){
+ pooh[i] = i;
+ }
+ for(i = 48; i < 96; ++i){
+ bear[i - 48] = i;
+ }
+
+ vector_subtract_16bit(res, &pooh[0], &bear[0], 48 * sizeof(signed short));
+
+ for(i = 0; i < 48; ++i) {
+ printf("%d, ", res[i]);
+ }
+ printf("\n");
+}
+*/
+
diff --git a/volk/spu_lib/spu_16s_vector_sum_unaligned.c b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
new file mode 100644
index 000000000..0097b4f56
--- /dev/null
+++ b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
@@ -0,0 +1,178 @@
+#include<spu_intrinsics.h>
+
+/*
+ * Element-wise sum of two arrays of signed 16-bit integers:
+ * target[k] = src0[k] + src1[k].
+ *
+ * None of the three pointers has to be 16-byte aligned. The routine only
+ * issues whole-quadword loads and stores and uses byte shuffles to line the
+ * data up.
+ *
+ * target    - destination buffer
+ * src0      - first input buffer
+ * src1      - second input buffer
+ * num_bytes - number of bytes (not elements) to process
+ *
+ * Returns target.
+ *
+ * Because whole quadwords are loaded and stored, the quadwords around the
+ * requested range are also read, and target bytes outside
+ * [target, target + num_bytes) inside the touched quadwords are written
+ * back with the values they had when they were loaded.
+ */
+void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+  //loop iterator i (unsigned, like num_bytes, so the loop compare is not mixed-sign)
+  unsigned int i = 0;
+  void* retval = target;
+
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0, 0};
+  vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+  //(bytes at or after the break index get 16 added, i.e. they select the second operand;
+  // the rotate then moves the break to byte 0 so the output starts with src0's first byte)
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  //the add is done on 32-bit lanes; no byte exceeds 15 + 16, so nothing carries between bytes
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+  //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+                                                 (vector unsigned int)oneup);
+  shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+
+  //second_oneup rotated right by target%16 bytes
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  //(keeps the old target bytes before the break, then the start of the result)
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  //(the rest of the result, then the old target bytes after the break)
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  //(reads the 16 bytes currently stored at the unaligned output position)
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  //(used for the tail: only the first num_bytes%16 result bytes replace target bytes)
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+
+  qword src0_past;
+  qword src0_present;
+  qword src1_past;
+  qword src1_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp0;
+  qword in_temp1;
+  qword out_temp0;
+  qword out_temp1;
+
+  //eight 16-bit lanes: the data is 16-bit, so the add must not carry across elements
+  vector signed short sum;
+
+  src0_past = si_lqd((qword)address_counter_src0, 0);
+  src1_past = si_lqd((qword)address_counter_src1, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  //one full quadword (eight elements) of output per iteration
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src0_present = si_lqd((qword)address_counter_src0, 16);
+    src1_present = si_lqd((qword)address_counter_src1, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    //realign both inputs so that their next unread byte sits in byte 0
+    in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+    in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+    sum = spu_add((vector signed short)in_temp0, (vector signed short)in_temp1);
+
+    //spread the result over the two target quadwords it straddles
+    out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    //the second quadword just written is the first one of the next round
+    tgt_past = out_temp1;
+    src0_past = src0_present;
+    src1_past = src1_present;
+    address_counter_src0 = spu_add(address_counter_src0, 16);
+    address_counter_src1 = spu_add(address_counter_src1, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+  }
+
+  //tail: the remaining num_bytes%16 bytes
+  src0_present = si_lqd((qword)address_counter_src0, 16);
+  src1_present = si_lqd((qword)address_counter_src1, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+  in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+  in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+  sum = spu_add((vector signed short)in_temp0, (vector signed short)in_temp1);
+  //current contents of the output position, then overlay the valid part of the result
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+
+
+
+/*
+int main(){
+
+ signed short pooh[48];
+ signed short bear[48];
+ signed short res[48];
+
+ int i = 0;
+ for(i = 0; i < 48; ++i){
+ pooh[i] = i;
+ }
+ for(i = 48; i < 96; ++i){
+ bear[i - 48] = i;
+ }
+
+ vector_sum(&pooh[9], &pooh[9], &bear[3], 30);
+
+ for(i = 0; i < 48; ++i) {
+ printf("%d, ", pooh[i]);
+ }
+ printf("\n");
+}
+*/
+
diff --git a/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
new file mode 100644
index 000000000..d1c960488
--- /dev/null
+++ b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
@@ -0,0 +1,222 @@
+#include<spu_intrinsics.h>
+
+
+
+
+/*
+ * Element-wise product of two arrays of single-precision complex numbers
+ * stored as interleaved (real, imaginary) float pairs:
+ * target[k] = src0[k] * src1[k].
+ *
+ * None of the three pointers has to be 16-byte aligned. The routine only
+ * issues whole-quadword loads and stores (two complex values per quadword)
+ * and uses byte shuffles to line the data up.
+ *
+ * target    - destination buffer
+ * src0      - first input buffer
+ * src1      - second input buffer
+ * num_bytes - number of bytes (not elements) to process
+ *
+ * Returns target.
+ *
+ * Because whole quadwords are loaded and stored, the quadwords around the
+ * requested range are also read, and target bytes outside
+ * [target, target + num_bytes) inside the touched quadwords are written
+ * back with the values they had when they were loaded.
+ */
+void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+  //loop iterator i (unsigned, like num_bytes, so the loop compare is not mixed-sign)
+  unsigned int i = 0;
+  void* retval = target;
+
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0, 0};
+  vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+  //(bytes at or after the break index get 16 added, i.e. they select the second operand;
+  // the rotate then moves the break to byte 0 so the output starts with src0's first byte)
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  //the add is done on 32-bit lanes; no byte exceeds 15 + 16, so nothing carries between bytes
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+  //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+                                                 (vector unsigned int)oneup);
+  shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+
+  //second_oneup rotated right by target%16 bytes
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  //(keeps the old target bytes before the break, then the start of the result)
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  //(the rest of the result, then the old target bytes after the break)
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  //(reads the 16 bytes currently stored at the unaligned output position)
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  //(used for the tail: only the first num_bytes%16 result bytes replace target bytes)
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+
+  qword src0_past;
+  qword src0_present;
+  qword src1_past;
+  qword src1_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp0;
+  qword in_temp1;
+  qword out_temp0;
+  qword out_temp1;
+
+
+  src0_past = si_lqd((qword)address_counter_src0, 0);
+  src1_past = si_lqd((qword)address_counter_src1, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  //complexprod0: swap the real and imaginary word of each complex value
+  vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
+                                                    0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b};
+  //complexprod1: words 0 and 2 of the first operand interleaved with words 0 and 2 of the second
+  vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                                                    0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b};
+  //complexprod2: words 1 and 3 of the first operand interleaved with words 1 and 3 of the second
+  vector unsigned char shuffle_mask_complexprod2 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                                                    0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
+  //sign bit set in words 1 and 3: xor with it negates the im*im products
+  vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+                                       0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
+
+  vector float prod0;
+  qword shuf0;
+  vector float prod1;
+  vector float sign_change;
+  qword summand0;
+  qword summand1;
+  vector float sum;
+
+
+  //one full quadword (two complex values) of output per iteration
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src0_present = si_lqd((qword)address_counter_src0, 16);
+    src1_present = si_lqd((qword)address_counter_src1, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    //realign both inputs so that their next unread byte sits in byte 0
+    in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+    in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+    //with a = in_temp0 and b = in_temp1:
+    //prod0 = (a.re*b.re, a.im*b.im), prod1 = (a.re*b.im, a.im*b.re)
+    prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
+    shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
+    prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
+    sign_change = spu_xor(prod0, (vector float)sign_changer);
+
+    //summand0 = (a.re*b.re, a.re*b.im), summand1 = (-a.im*b.im, a.im*b.re)
+    summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
+
+    summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
+
+    sum = spu_add((vector float)summand0, (vector float)summand1);
+
+
+    //spread the result over the two target quadwords it straddles
+    out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    //the second quadword just written is the first one of the next round
+    tgt_past = out_temp1;
+    src0_past = src0_present;
+    src1_past = src1_present;
+    address_counter_src0 = spu_add(address_counter_src0, 16);
+    address_counter_src1 = spu_add(address_counter_src1, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+  }
+
+  //tail: the remaining num_bytes%16 bytes
+  src0_present = si_lqd((qword)address_counter_src0, 16);
+  src1_present = si_lqd((qword)address_counter_src1, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+  in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+  in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+
+
+  //same product as in the loop; the cross terms multiply the input (in_temp0), not prod0
+  prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
+  shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
+  prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
+  sign_change = spu_xor(prod0, (vector float)sign_changer);
+  summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
+  summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
+  sum = spu_add((vector float)summand0, (vector float)summand1);
+
+
+  //current contents of the output position, then overlay the valid part of the result
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+
+
+
+/*
+int main(){
+
+ float pooh[48];
+ float bear[48];
+ float res[48];
+
+ int i = 0;
+ for(i = 0; i < 48; ++i){
+ pooh[i] = (float) i;
+ }
+ for(i = 48; i < 96; ++i){
+ bear[i - 48] = (float) i;
+ }
+
+ vector_product_complex(res, pooh, bear, 48*sizeof(float));
+
+
+
+ for(i = 0; i < 48; ++i) {
+ printf("%f, ", res[i]);
+ }
+ printf("\n");
+
+
+}
+*/
+
diff --git a/volk/spu_lib/spu_memcpy_unaligned.c b/volk/spu_lib/spu_memcpy_unaligned.c
new file mode 100644
index 000000000..0f15b5d80
--- /dev/null
+++ b/volk/spu_lib/spu_memcpy_unaligned.c
@@ -0,0 +1,290 @@
+#include<libvector/libvector_memcpy_unaligned.h>
+#include<spu_intrinsics.h>
+
+/*
+ * memcpy for buffers of arbitrary alignment.
+ *
+ * Copies num_bytes bytes from src to target using only whole-quadword loads
+ * and stores; byte shuffles line the data up. Returns target.
+ *
+ * Target bytes outside [target, target + num_bytes) that share a quadword
+ * with the copied range are written back with the values that were loaded.
+ */
+void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes){
+  //byte offsets of the two pointers inside their quadwords, and the split of num_bytes
+  unsigned int src_off = (unsigned int)src%16;
+  unsigned int tgt_off = (unsigned int)target%16;
+  unsigned int tail_len = num_bytes%16;
+  unsigned int quads_left = num_bytes/16;
+
+  //running addresses, held in the first word of a vector so they can feed si_lqd/si_stqd
+  vector unsigned int src_addr = {(unsigned int)src, 0, 0, 0};
+  vector unsigned int tgt_addr = {(unsigned int)target, 0, 0, 0};
+
+  //identity selectors: bytes 0..15 of the first / of the second shuffle operand
+  vector unsigned char from_first = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  vector unsigned char from_second = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                      0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+  vector unsigned char sixteen = spu_splats((unsigned char)16);
+
+  vector unsigned char pivot;
+  vector unsigned char at_or_after;
+
+  //src_gather: bytes src_off..15 of the older source quadword followed by
+  //bytes 0..src_off-1 of the newer one
+  pivot = spu_splats((unsigned char)src_off);
+  at_or_after = spu_or(spu_cmpgt(from_first, pivot), spu_cmpeq(from_first, pivot));
+  vector unsigned int src_gather = spu_add((vector unsigned int)spu_and(sixteen, at_or_after),
+                                           (vector unsigned int)from_first);
+  src_gather = spu_rlqwbyte(src_gather, src_off);
+
+  //second-operand selector rotated right by tgt_off bytes
+  vector unsigned char rotated_second = spu_rlqwbyte(from_second, -tgt_off);
+
+  //split_at_tgt: first operand before byte tgt_off, second operand from there on
+  pivot = spu_splats((unsigned char)tgt_off);
+  at_or_after = spu_or(spu_cmpgt(from_first, pivot), spu_cmpeq(from_first, pivot));
+  vector unsigned int split_at_tgt = spu_add((vector unsigned int)spu_and(sixteen, at_or_after),
+                                             (vector unsigned int)from_first);
+
+  //store_lo: old target bytes before tgt_off, then the start of the new data
+  vector unsigned char store_lo = spu_shuffle(from_first, rotated_second, (vector unsigned char)split_at_tgt);
+  //store_hi: the rest of the new data, then old target bytes from tgt_off on
+  vector unsigned char store_hi = spu_shuffle(rotated_second, from_first, (vector unsigned char)split_at_tgt);
+  //tgt_gather: the 16 bytes currently stored at the unaligned output position
+  vector unsigned int tgt_gather = spu_rlqwbyte(split_at_tgt, tgt_off);
+
+  //split_at_tail: first operand for the first tail_len bytes, second operand afterwards
+  pivot = spu_splats((unsigned char)tail_len);
+  at_or_after = spu_or(spu_cmpgt(from_first, pivot), spu_cmpeq(from_first, pivot));
+  vector unsigned int split_at_tail = spu_add((vector unsigned int)spu_and(sixteen, at_or_after),
+                                              (vector unsigned int)from_first);
+
+  qword src_prev = si_lqd((qword)src_addr, 0);
+  qword tgt_prev = si_lqd((qword)tgt_addr, 0);
+  qword src_next;
+  qword tgt_next;
+  qword data;
+
+  //whole quadwords: 16 source bytes per round, spread over two target quadwords
+  for(; quads_left != 0; --quads_left) {
+    src_next = si_lqd((qword)src_addr, 16);
+    tgt_next = si_lqd((qword)tgt_addr, 16);
+
+    data = spu_shuffle(src_next, src_prev, (vector unsigned char)src_gather);
+
+    si_stqd(spu_shuffle(tgt_prev, data, store_lo), (qword)tgt_addr, 0);
+    //what goes into the upper quadword now is the lower quadword of the next round
+    tgt_prev = spu_shuffle(tgt_next, data, store_hi);
+    si_stqd(tgt_prev, (qword)tgt_addr, 16);
+
+    src_prev = src_next;
+    src_addr = spu_add(src_addr, 16);
+    tgt_addr = spu_add(tgt_addr, 16);
+  }
+
+  //tail: overlay the last tail_len bytes on what is already stored at the output position
+  src_next = si_lqd((qword)src_addr, 16);
+  tgt_next = si_lqd((qword)tgt_addr, 16);
+
+  data = spu_shuffle(src_next, src_prev, (vector unsigned char)src_gather);
+  qword existing = spu_shuffle(tgt_next, tgt_prev, (vector unsigned char)tgt_gather);
+  qword meld = spu_shuffle(data, existing, (vector unsigned char)split_at_tail);
+
+  si_stqd(spu_shuffle(tgt_prev, meld, store_lo), (qword)tgt_addr, 0);
+  si_stqd(spu_shuffle(tgt_next, meld, store_hi), (qword)tgt_addr, 16);
+
+  return target;
+}
+
+
+
+/*
+void* mcpy(void* target, void* src, size_t num_bytes){
+ //loop iterator i
+ int i = 0;
+ void* retval = src;
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at src%16
+ vector unsigned char src_cmp = spu_splats((unsigned char)(src%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, src%16);
+
+
+
+
+ vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -(target%16));
+ vector unsigned char tgt_first = spu_rlqwbyte(oneup, -(target%16));
+
+ //alpha: first half of first, second half of second, break at target%16
+ src_cmp = spu_splats((unsigned char)(target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - target%16
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+ printf("num_bytesmod16 %d\n", num_bytes%16);
+ printf("beta %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n",
+ spu_extract((vector unsigned char) shuffle_mask_beta, 0),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 1),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 2),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 3),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 4),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 5),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 6),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 7),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 8),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 9),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 10),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 11),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 12),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 13),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 14),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 15));
+
+
+
+
+
+
+
+ qword src_past;
+ qword src_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp;
+ qword out_temp0;
+ qword out_temp1;
+
+ src_past = si_lqd((qword)address_counter_src, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+ out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src_past = src_present;
+ address_counter_src = spu_add(address_counter_src, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+
+}
+*/
+/*
+int main(){
+
+ unsigned char pooh[48];
+ unsigned char bear[48];
+
+ int i = 0;
+ for(i = 0; i < 48; ++i){
+ pooh[i] = i;
+ bear[i] = i;
+ }
+
+ spu_mcpy(&pooh[9],&bear[3], 15);
+
+ for(i = 0; i < 48; ++i) {
+ printf("%d, ", pooh[i]);
+ }
+ printf("\n");
+}
+
+*/
diff --git a/volk/spu_lib/spu_memset_unaligned.S b/volk/spu_lib/spu_memset_unaligned.S
new file mode 100644
index 000000000..c260a125c
--- /dev/null
+++ b/volk/spu_lib/spu_memset_unaligned.S
@@ -0,0 +1,185 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "gc_spu_macs.h"
+
+ .file "spu_memset_unaligned.S"
+
+ /*
+ * Computes this, only a lot faster...
+ *
+ * void *
+ * libvector_memset_unaligned(void *pv, int c, size_t n)
+ * {
+ * unsigned char *p = (unsigned char *) pv;
+ * size_t i;
+ * for (i = 0; i < n; i++)
+ * p[i] = c;
+ *
+ * return pv;
+ * }
+ */
+
+#define p_arg arg1 // incoming pointer pv; only read below (copied into p). NOTE(review): this comment used to say arg1 gets clobbered with the return value, but no visible instruction writes it - confirm against gc_spu_macs.h
+#define c arg2 // the constant we're writing
+#define n arg3 // how many bytes to write
+
+#define p r13 // where we're writing
+#define t0 r14 // scratch
+#define t1 r15 // scratch
+#define mask r16 // byte-select mask for the partial quadwords
+#define old r17 // previous contents of a partially written quadword
+#define an r18 // aligned n (n rounded down to mod 16 boundary)
+#define next_p r19 // p advanced past the stores of the current iteration
+#define cond1 r20 // middle_loop: another 128-byte round to go?
+#define cond2 r21 // n | an after middle_loop; zero means nothing is left
+#define m r22 // do_head: number of bytes written into the first quadword
+#define r r23 // do_head: bytes from p to the next 16-byte boundary
+
+ PROC_ENTRY(libvector_memset_unaligned)
+
+ // Hint the return from do_head, in case we go that way.
+ // There's pretty much nothing we can do to hint the branch to it.
+ hbrr do_head_br, head_complete
+
+ MR(p, p_arg) // working copy of the destination pointer; p is r13 per the define above, p_arg is left as it was. NOTE(review): old comment claimed p is the return register (r3) - confirm
+ BRZ_RETURN(n) // presumably returns right away when n == 0 (macro from gc_spu_macs.h)
+
+ MODULO(t0, p, 16) // is p%16 == 0?
+ VSPLTB(c, c, 3) // splat byte in preferred slot of c into all slots
+ brnz t0, do_head // no, handle it
+head_complete:
+
+ /*
+ * preconditions:
+ * p%16 == 0, n > 0
+ */
+ hbrr middle_loop_br, middle_loop
+
+ ROUND_DOWN(an, n, 16) // an is "aligned n"
+ MODULO(n, n, 16) // what's left over in the last quad
+ brz an, do_tail // no whole quad words; skip to tail
+ clgti t0, an, 127 // an >= 128?
+ brz t0, middle2 // nope, go handle the cases between 0 and 112
+
+ /*
+ * 128 bytes / iteration
+ */
+ .p2align 4
+middle_loop:
+ ai an, an, -128 // account for the eight quads stored in this round
+ stqd c, 0*16(p)
+ ai next_p, p, 128
+ stqd c, 1*16(p)
+ cgti cond1, an, 127 // at least 128 bytes still to go? (signed compare)
+ stqd c, 2*16(p)
+
+ stqd c, 3*16(p)
+ stqd c, 4*16(p)
+ stqd c, 5*16(p)
+ stqd c, 6*16(p)
+
+ MR(p, next_p)
+ stqd c, 7*16-128(next_p) // eighth quad of the round, addressed relative to next_p (old p + 128)
+ or cond2, n, an // zero only when both the tail count and an are zero
+middle_loop_br:
+ brnz cond1, middle_loop
+
+ /*
+ * if an and n are both zero, return now
+ */
+ BRZ_RETURN(cond2)
+
+ /*
+ * otherwise handle last of full quad words
+ *
+ * 0 <= an < 128, p%16 == 0
+ */
+middle2:
+ /*
+ * if an == 0, go handle the final non-full quadword
+ */
+ brz an, do_tail
+ hbrr middle2_loop_br, middle2_loop
+
+ .p2align 3
+middle2_loop:
+ ai next_p, p, 16
+ stqd c, 0(p) // one quad per iteration
+ ai an, an, -16
+ LMR(p, next_p) // presumably a register move (p = next_p); LMR comes from gc_spu_macs.h - confirm
+middle2_loop_br:
+ brnz an, middle2_loop
+
+ /* We're done with the full quadwords. */
+
+ /*
+ * Handle the final partial quadword.
+ * We'll be modifying only the left hand portion of the quad.
+ *
+ * preconditions:
+ * an == 0, 0 <= n < 16, p%16 == 0
+ */
+do_tail:
+ HINT_RETURN(do_tail_ret)
+ il mask, -1 // mask = all 1's
+ sfi t1, n, 16 // t1 = 16 - n
+ lqd old, 0(p)
+ shlqby mask, mask, t1 // 1's remain only in the leftmost n bytes
+ selb t0, old, c, mask // c where mask is 1, old contents elsewhere
+ stqd t0, 0(p)
+do_tail_ret:
+ RETURN()
+
+ /*
+ * ----------------------------------------------------------------
+ * Handle the first partial quadword
+ *
+ * preconditions:
+ * p%16 != 0
+ *
+ * postconditions:
+ * p%16 == 0 or n == 0
+ *
+ * |-- m --|
+ * +----------------+----------------+
+ * | //////// | |
+ * +----------------+----------------+
+ * |----- r -----|
+ * p
+ * ----------------------------------------------------------------
+ */
+do_head:
+ lqd old, 0(p) // current contents of the quadword p points into
+ MODULO_NEG(r, p, 16) // r is used below as the byte count from p to the next 16-byte boundary (see diagram); macro from gc_spu_macs.h
+ il mask, -1
+ UMIN(m, r, n) // m is used as the number of bytes to write here, presumably min(r, n)
+ shlqby mask, mask, m // 1's in the top, m*8 0's in the bottom
+ MR(t1, p) // remember where this quadword gets stored, p is advanced below
+ sf t0, m, r // t0 = r - m
+ a p, p, m // p += m
+ rotqby mask, mask, t0 // rotate 0's to the right place
+ sf n, m, n // n -= m
+ selb t0, c, old, mask // merge
+ stqd t0, 0(t1)
+ BRZ_RETURN(n)
+do_head_br:
+ br head_complete
diff --git a/volk/tmpl/volk.tmpl.c b/volk/tmpl/volk.tmpl.c
new file mode 100644
index 000000000..f915f157f
--- /dev/null
+++ b/volk/tmpl/volk.tmpl.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <volk/volk_common.h>
+#include "volk_machines.h"
+#include <volk/volk_typedefs.h>
+#include <volk/volk_cpu.h>
+#include "volk_rank_archs.h"
+#include <volk/volk.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+static size_t __alignment = 0;
+static intptr_t __alignment_mask = 0;
+
+ /*
+  * Select (once) and return the machine description used at run time.
+  *
+  * A machine is a candidate when every arch bit in its caps field is also
+  * present in the mask returned by volk_get_lvarch(). Among the candidates
+  * the one with the numerically largest caps value wins. The choice is
+  * cached in a function-local static, so later calls return immediately.
+  *
+  * Side effects on the first call: prints the chosen machine name to stdout
+  * and initialises __alignment and __alignment_mask from the machine.
+  *
+  * If no compiled-in machine is usable on this CPU, an error is written to
+  * stderr and the process is aborted (previously this dereferenced NULL).
+  */
+ struct volk_machine *get_machine(void) {
+   extern struct volk_machine *volk_machines[];
+   extern unsigned int n_volk_machines;
+   static struct volk_machine *machine = NULL;
+
+   if(machine == NULL) {
+     //the CPU capability mask cannot change while we iterate, so query it once
+     const unsigned int lvarch = volk_get_lvarch();
+     struct volk_machine *best = NULL;
+     unsigned int max_score = 0;
+     unsigned int i;
+
+     for(i=0; i<n_volk_machines; i++) {
+       //skip machines that need any arch the CPU does not provide
+       if(volk_machines[i]->caps & (~lvarch)) continue;
+       if(volk_machines[i]->caps > max_score) {
+         max_score = volk_machines[i]->caps;
+         best = volk_machines[i];
+       }
+     }
+
+     if(best == NULL) {
+       fprintf(stderr, "Volk error: no compiled-in machine is usable on this CPU\n");
+       abort();
+     }
+
+     printf("Using Volk machine: %s\n", best->name);
+     __alignment = best->alignment;
+     __alignment_mask = (intptr_t)(__alignment-1);
+     machine = best;
+   }
+   return machine;
+ }
+
+ size_t volk_get_alignment(void)
+ {
+   /* Selecting the machine is what fills in __alignment, so force it first. */
+   (void)get_machine();
+   return __alignment;
+ }
+
+ bool volk_is_aligned(const void *ptr)
+ {
+   /* Any address bit left under the mask means ptr is off the boundary. */
+   const intptr_t low_bits = (intptr_t)(ptr) & __alignment_mask;
+   return low_bits == 0;
+ }
+
+#define LV_HAVE_GENERIC
+#define LV_HAVE_DISPATCHER
+
+#for $kern in $kernels
+
+#if $kern.has_dispatcher
+#include <volk/$(kern.name).h> //pulls in the dispatcher
+#end if
+
+ /*
+  * Alignment-dispatching entry point generated for each kernel.
+  *
+  * When the kernel provides its own dispatcher (has_dispatcher is true) that
+  * dispatcher is called and the function returns; the generic check below is
+  * still emitted but is never reached in that case.
+  *
+  * Otherwise every pointer-typed argument (its type string contains a star)
+  * is folded together with nested VOLK_OR_PTR calls, terminated by a literal
+  * 0, and the result is tested with volk_is_aligned. The aligned function
+  * pointer is used when the test passes, the unaligned one otherwise.
+  */
+ static inline void __$(kern.name)_d($kern.arglist_full)
+ {
+ #if $kern.has_dispatcher
+ $(kern.name)_dispatcher($kern.arglist_names);
+ return;
+ #end if
+
+ /* this function is only installed by the init routine, after get_machine has set the alignment mask */
+ if (volk_is_aligned(
+ #set $num_open_parens = 0
+ #for $arg_type, $arg_name in $kern.args
+ #if '*' in $arg_type
+ VOLK_OR_PTR($arg_name,
+ #set $num_open_parens += 1
+ #end if
+ #end for
+ 0$(')'*$num_open_parens)
+ )){
+ $(kern.name)_a($kern.arglist_names);
+ }
+ else{
+ $(kern.name)_u($kern.arglist_names);
+ }
+ }
+
+ /*
+  * One-time setup for a kernel: rank the implementations of the selected
+  * machine for the aligned and the unaligned case, store the winners in the
+  * public _a and _u function pointers, and point the plain function pointer
+  * at the alignment-dispatching wrapper.
+  */
+ static inline void __init_$(kern.name)(void)
+ {
+   /* get_machine caches its result, so a single lookup is equivalent */
+   struct volk_machine *const lv_machine = get_machine();
+
+   const char *kernel_name = lv_machine->$(kern.name)_name;
+   const char **names = lv_machine->$(kern.name)_impl_names;
+   const int *deps = lv_machine->$(kern.name)_impl_deps;
+   const bool *needs_alignment = lv_machine->$(kern.name)_impl_alignment;
+   const size_t count = lv_machine->$(kern.name)_n_impls;
+
+   const size_t best_aligned = volk_rank_archs(kernel_name, names, deps, needs_alignment, count, true/*aligned*/);
+   const size_t best_unaligned = volk_rank_archs(kernel_name, names, deps, needs_alignment, count, false/*unaligned*/);
+
+   $(kern.name)_a = lv_machine->$(kern.name)_impls[best_aligned];
+   $(kern.name)_u = lv_machine->$(kern.name)_impls[best_unaligned];
+
+   assert($(kern.name)_a);
+   assert($(kern.name)_u);
+
+   $(kern.name) = &__$(kern.name)_d;
+ }
+
+ /*
+  * Lazy-initialisation trampolines.
+  *
+  * The three public function pointers defined below start out pointing at
+  * these wrappers. The first call through any of them runs the init routine,
+  * which overwrites all three pointers, and then forwards the call through
+  * the freshly assigned pointer.
+  */
+ static inline void __$(kern.name)_a($kern.arglist_full)
+ {
+ __init_$(kern.name)();
+ $(kern.name)_a($kern.arglist_names);
+ }
+
+ static inline void __$(kern.name)_u($kern.arglist_full)
+ {
+ __init_$(kern.name)();
+ $(kern.name)_u($kern.arglist_names);
+ }
+
+ static inline void __$(kern.name)($kern.arglist_full)
+ {
+ __init_$(kern.name)();
+ $(kern.name)($kern.arglist_names);
+ }
+
+ /* public entry points; initially bound to the trampolines above */
+ $kern.pname $(kern.name)_a = &__$(kern.name)_a;
+ $kern.pname $(kern.name)_u = &__$(kern.name)_u;
+ $kern.pname $(kern.name) = &__$(kern.name);
+
+ /*
+  * Run the implementation of this kernel whose name equals impl_name,
+  * bypassing the automatic ranking. The name is resolved to an index with
+  * volk_get_index and that entry of the machine's implementation table is
+  * called with the kernel arguments.
+  */
+ void $(kern.name)_manual($kern.arglist_full, const char* impl_name)
+ {
+   struct volk_machine *const lv_machine = get_machine();
+   const int lv_impl_index = volk_get_index(
+     lv_machine->$(kern.name)_impl_names,
+     lv_machine->$(kern.name)_n_impls,
+     impl_name
+   );
+   lv_machine->$(kern.name)_impls[lv_impl_index]($kern.arglist_names);
+ }
+
+ /*
+  * Return a descriptor of this kernel's implementations on the selected
+  * machine: names, arch dependencies, alignment flags and their count.
+  * The pointers refer to the machine's own tables; nothing is copied.
+  */
+ volk_func_desc_t $(kern.name)_get_func_desc(void) {
+   struct volk_machine *const lv_machine = get_machine();
+   volk_func_desc_t desc = {
+     lv_machine->$(kern.name)_impl_names,
+     lv_machine->$(kern.name)_impl_deps,
+     lv_machine->$(kern.name)_impl_alignment,
+     lv_machine->$(kern.name)_n_impls
+   };
+   return desc;
+ }
+
+#end for
diff --git a/volk/tmpl/volk.tmpl.h b/volk/tmpl/volk.tmpl.h
new file mode 100644
index 000000000..464b65598
--- /dev/null
+++ b/volk/tmpl/volk.tmpl.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_RUNTIME
+#define INCLUDED_VOLK_RUNTIME
+
+#include <volk/volk_typedefs.h>
+#include <volk/volk_config_fixed.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+__VOLK_DECL_BEGIN
+
+ /*!
+  * Describes the implementations available for one kernel,
+  * as returned by the per-kernel get_func_desc functions.
+  * The three arrays are parallel and hold n_impls entries each.
+  */
+ typedef struct volk_func_desc
+ {
+ const char **impl_names; //!< name of each implementation
+ const int *impl_deps; //!< arch dependency bits of each implementation
+ const bool *impl_alignment; //!< whether each implementation requires alignment
+ const size_t n_impls; //!< number of implementations listed
+ } volk_func_desc_t;
+
+//! Get the machine alignment in bytes
+VOLK_API size_t volk_get_alignment(void);
+
+/*!
+ * The VOLK_OR_PTR macro is a convenience macro
+ * for checking the alignment of a set of pointers.
+ * Example usage:
+ * volk_is_aligned(VOLK_OR_PTR(VOLK_OR_PTR(p0, p1), p2))
+ */
+#define VOLK_OR_PTR(ptr0, ptr1) \
+ (const void *)(((intptr_t)(ptr0)) | ((intptr_t)(ptr1)))
+
+/*!
+ * Is the pointer on a machine alignment boundary?
+ *
+ * Note: for performance reasons, this function
+ * is not usable until another volk API call is made
+ * which will perform certain initialization tasks.
+ *
+ * \param ptr the pointer to some memory buffer
+ * \return 1 for alignment boundary, else 0
+ */
+VOLK_API bool volk_is_aligned(const void *ptr);
+
+#for $kern in $kernels
+
+//! A function pointer to the dispatcher implementation
+extern VOLK_API $kern.pname $kern.name;
+
+//! A function pointer to the fastest aligned implementation
+extern VOLK_API $kern.pname $(kern.name)_a;
+
+//! A function pointer to the fastest unaligned implementation
+extern VOLK_API $kern.pname $(kern.name)_u;
+
+//! Call into a specific implementation given by name
+extern VOLK_API void $(kern.name)_manual($kern.arglist_full, const char* impl_name);
+
+ //! Get description parameters for this kernel
+extern VOLK_API volk_func_desc_t $(kern.name)_get_func_desc(void);
+#end for
+
+__VOLK_DECL_END
+
+#endif /*INCLUDED_VOLK_RUNTIME*/
diff --git a/volk/tmpl/volk_config_fixed.tmpl.h b/volk/tmpl/volk_config_fixed.tmpl.h
new file mode 100644
index 000000000..e1c01ae77
--- /dev/null
+++ b/volk/tmpl/volk_config_fixed.tmpl.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_CONFIG_FIXED_H
+#define INCLUDED_VOLK_CONFIG_FIXED_H
+
+#for $i, $arch in enumerate($archs)
+#define LV_$(arch.name.upper()) $i
+#end for
+
+ #endif /*INCLUDED_VOLK_CONFIG_FIXED_H*/
diff --git a/volk/tmpl/volk_cpu.tmpl.c b/volk/tmpl/volk_cpu.tmpl.c
new file mode 100644
index 000000000..81fc679cb
--- /dev/null
+++ b/volk/tmpl/volk_cpu.tmpl.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <volk/volk_cpu.h>
+#include <volk/volk_config_fixed.h>
+#include <stdlib.h>
+
+struct VOLK_CPU volk_cpu;
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+ #define VOLK_CPU_x86
+#endif
+
+#if defined(VOLK_CPU_x86)
+
+//implement get cpuid for gcc compilers using a system or local copy of cpuid.h
+#if defined(__GNUC__)
+ #if defined(HAVE_CPUID_H)
+ #include <cpuid.h>
+ #else
+ #include "gcc_x86_cpuid.h"
+ #endif
+ #define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r+0, (unsigned int *)r+1, (unsigned int *)r+2, (unsigned int *)r+3)
+
+ /* Return Intel AVX extended CPU capabilities register.
+ * This function will bomb on non-AVX-capable machines, so
+ * check for AVX capability before executing.
+ */
+ #if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4
+ static inline unsigned long long _xgetbv(unsigned int index){
+ unsigned int eax, edx;
+ //run XGETBV with the register index in ECX; the result comes back in EDX:EAX
+ __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
+ //reassemble the 64-bit value: EDX is the high half, EAX the low half
+ return ((unsigned long long)edx << 32) | eax;
+ }
+ #define __xgetbv() _xgetbv(0)
+ #else
+ #define __xgetbv() 0
+ #endif
+
+//implement get cpuid for MSVC compilers using __cpuid intrinsic
+#elif defined(_MSC_VER) && defined(HAVE_INTRIN_H)
+ #include <intrin.h>
+ #define cpuid_x86(op, r) __cpuid(((int*)r), op)
+
+ #if defined(_XCR_XFEATURE_ENABLED_MASK)
+ #define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
+ #else
+ #define __xgetbv() 0
+ #endif
+
+#else
+ #error "A get cpuid for volk is not available on this compiler..."
+#endif //defined(__GNUC__)
+
+#endif //defined(VOLK_CPU_x86)
+
+ /*
+  * Run CPUID leaf op and return the single bit number bit of output register
+  * reg (an index into the four-register result). Always 0 when not built for x86.
+  */
+ static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit) {
+ #if !defined(VOLK_CPU_x86)
+   return 0;
+ #else
+   unsigned int cpuid_out[4];
+   cpuid_x86(op, cpuid_out);
+   return (cpuid_out[reg] >> bit) & 0x01;
+ #endif
+ }
+
+ /*
+  * Return non-zero when the value reported in the first output register of
+  * CPUID leaf 0x80000000 is at least val. Always 0 when not built for x86.
+  */
+ static inline unsigned int check_extended_cpuid(unsigned int val) {
+ #if !defined(VOLK_CPU_x86)
+   return 0;
+ #else
+   unsigned int cpuid_out[4];
+   cpuid_x86(0x80000000, cpuid_out);
+   return cpuid_out[0] >= val;
+ #endif
+ }
+
+ /*
+  * Report whether the operating system has enabled AVX register state.
+  *
+  * Reads the extended control register through the __xgetbv() wrapper and
+  * requires BOTH bit 1 (XMM state) and bit 2 (YMM state) to be set. Testing
+  * for "any of the two bits" would report AVX as usable on systems that only
+  * save SSE state.
+  *
+  * Returns 1 when both bits are set, otherwise 0. Always 0 when not built
+  * for x86 or when __xgetbv() is stubbed out to 0.
+  */
+ static inline unsigned int get_avx_enabled(void) {
+ #if defined(VOLK_CPU_x86)
+   return (__xgetbv() & 0x6) == 0x6;
+ #else
+   return 0;
+ #endif
+ }
+
+//neon detection is linux specific
+#if defined(__arm__) && defined(__linux__)
+ #include <asm/hwcap.h>
+ #include <linux/auxvec.h>
+ #include <stdio.h>
+ #define VOLK_CPU_ARM
+#endif
+
+ /*
+  * Detect NEON support on ARM/Linux builds (VOLK_CPU_ARM).
+  *
+  * Scans /proc/self/auxv for an AT_HWCAP record whose value has HWCAP_NEON
+  * set. Returns 1 when such a record is found; returns 0 when it is not,
+  * when the file cannot be opened, or on any other platform.
+  */
+ static int has_neon(void){
+ #if defined(VOLK_CPU_ARM)
+   FILE *auxvec_f;
+   unsigned long auxvec[2];
+   unsigned int found_neon = 0;
+   auxvec_f = fopen("/proc/self/auxv", "rb");
+   if(!auxvec_f) return 0;
+
+   //each auxv record is a pair of words: an ID followed by its value
+   //stop on the first short read (EOF or error); otherwise a CPU without
+   //NEON would keep re-testing the stale last record forever
+   while(!found_neon && fread(auxvec, sizeof(unsigned long), 2, auxvec_f) == 2) {
+     if((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON))
+       found_neon = 1;
+   }
+
+   fclose(auxvec_f);
+   return found_neon;
+ #else
+   return 0;
+ #endif
+ }
+
+ /* Compile-time check: 1 when the compiler defines __PPC__, otherwise 0. */
+ static int has_ppc(void){
+   int is_ppc = 0;
+ #ifdef __PPC__
+   is_ppc = 1;
+ #endif
+   return is_ppc;
+ }
+
+ /*
+  * Generated probes, one per arch: each runs the checks listed for that arch
+  * in order and returns 0 as soon as one of them yields 0, otherwise 1.
+  * An arch with no checks therefore always reports 1.
+  */
+ #for $arch in $archs
+ static int i_can_has_$arch.name (void) {
+ #for $check, $params in $arch.checks
+ if ($(check)($(', '.join($params))) == 0) return 0;
+ #end for
+ return 1;
+ }
+
+ #end for
+
+ /*
+  * set_float_rounding: select round-to-nearest as the floating point
+  * rounding mode, using whichever facility the build provides.
+  */
+ #if defined(HAVE_FENV_H)
+ #include <fenv.h>
+ //C99 path: fesetround from fenv.h
+ static inline void set_float_rounding(void){
+ fesetround(FE_TONEAREST);
+ }
+ #elif defined(_MSC_VER)
+ #include <float.h>
+ //MSVC path: rewrite only the rounding-control field of the control word
+ static inline void set_float_rounding(void){
+ unsigned int cwrd;
+ //NOTE(review): with a zero mask this call presumably only reads the current control word - confirm against the CRT docs
+ _controlfp_s(&cwrd, 0, 0);
+ _controlfp_s(&cwrd, _RC_NEAR, _MCW_RC);
+ }
+ #else
+ //no known facility: the rounding mode is left untouched
+ static inline void set_float_rounding(void){
+ //do nothing
+ }
+ #endif
+
+ /*
+  * Fill the global volk_cpu table with the generated per-arch probe
+  * functions and then apply set_float_rounding().
+  */
+ void volk_cpu_init() {
+ #for $arch in $archs
+ volk_cpu.has_$arch.name = &i_can_has_$arch.name;
+ #end for
+ set_float_rounding();
+ }
+
+ /*
+  * Return the capability mask of the running CPU: for every arch whose
+  * probe reports 1, the bit at that arch's LV_ index is set.
+  * Every call re-runs volk_cpu_init() and all probes; nothing is cached here.
+  */
+ unsigned int volk_get_lvarch() {
+ unsigned int retval = 0;
+ volk_cpu_init();
+ #for $arch in $archs
+ retval += volk_cpu.has_$(arch.name)() << LV_$(arch.name.upper());
+ #end for
+ return retval;
+ }
diff --git a/volk/tmpl/volk_cpu.tmpl.h b/volk/tmpl/volk_cpu.tmpl.h
new file mode 100644
index 000000000..4d66512e1
--- /dev/null
+++ b/volk/tmpl/volk_cpu.tmpl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_CPU_H
+#define INCLUDED_VOLK_CPU_H
+
+#include <volk/volk_common.h>
+
+__VOLK_DECL_BEGIN
+
+ /*
+  * Table of capability probes: one function pointer per arch.
+  * Each probe returns an int; volk_cpu_init() is what assigns them.
+  */
+ struct VOLK_CPU {
+ #for $arch in $archs
+ int (*has_$arch.name) ();
+ #end for
+ };
+
+extern struct VOLK_CPU volk_cpu;
+
+void volk_cpu_init ();
+unsigned int volk_get_lvarch ();
+
+__VOLK_DECL_END
+
+#endif /*INCLUDED_VOLK_CPU_H*/
diff --git a/volk/tmpl/volk_machine_xxx.tmpl.c b/volk/tmpl/volk_machine_xxx.tmpl.c
new file mode 100644
index 000000000..68d7f3eba
--- /dev/null
+++ b/volk/tmpl/volk_machine_xxx.tmpl.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#set $this_machine = $machine_dict[$args[0]]
+#set $arch_names = $this_machine.arch_names
+
+#for $arch in $this_machine.archs
+#define LV_HAVE_$(arch.name.upper()) 1
+#end for
+
+#include <volk/volk_common.h>
+#include "volk_machines.h"
+#include <volk/volk_config_fixed.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#for $kern in $kernels
+#include <volk/$(kern.name).h>
+#end for
+
+########################################################################
+#def make_arch_have_list($archs)
+$(' | '.join(['(1 << LV_%s)'%a.name.upper() for a in $archs]))#slurp
+#end def
+
+########################################################################
+#def make_impl_name_list($impls)
+{$(', '.join(['"%s"'%i.name for i in $impls]))}#slurp
+#end def
+
+########################################################################
+#def make_impl_align_list($impls)
+{$(', '.join(['true' if i.is_aligned else 'false' for i in $impls]))}#slurp
+#end def
+
+########################################################################
+#def make_impl_deps_list($impls)
+{$(', '.join([' | '.join(['(1 << LV_%s)'%d.upper() for d in i.deps]) for i in $impls]))}#slurp
+#end def
+
+########################################################################
+#def make_impl_fcn_list($name, $impls)
+{$(', '.join(['%s_%s'%($name, i.name) for i in $impls]))}#slurp
+#end def
+
+ ## Static description of this machine: caps mask, name and alignment,
+ ## followed by one group of six initializers per kernel. The groups are
+ ## positional, so their order has to match the fields of struct volk_machine.
+ struct volk_machine volk_machine_$(this_machine.name) = {
+ $make_arch_have_list($this_machine.archs),
+ "$this_machine.name",
+ $this_machine.alignment,
+ #for $kern in $kernels
+ #set $impls = $kern.get_impls($arch_names)
+ "$kern.name", ##//kernel name
+ $make_impl_name_list($impls), ##//list of kernel implementations by name
+ $make_impl_deps_list($impls), ##//list of arch dependencies per implementation
+ $make_impl_align_list($impls), ##//alignment required? for each implementation
+ $make_impl_fcn_list($kern.name, $impls), ##//pointer to each implementation
+ $(len($impls)), ##//number of implementations listed here
+ #end for
+ };
diff --git a/volk/tmpl/volk_machines.tmpl.c b/volk/tmpl/volk_machines.tmpl.c
new file mode 100644
index 000000000..57dd03c98
--- /dev/null
+++ b/volk/tmpl/volk_machines.tmpl.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <volk/volk_common.h>
+#include <volk/volk_typedefs.h>
+#include "volk_machines.h"
+
+ /*
+  * All machines compiled into this library. An entry is only emitted when
+  * the matching LV_MACHINE_ macro is defined at compile time.
+  */
+ struct volk_machine *volk_machines[] = {
+ #for $machine in $machines
+ #ifdef LV_MACHINE_$(machine.name.upper())
+ &volk_machine_$(machine.name),
+ #endif
+ #end for
+ };
+
+ /* number of entries in volk_machines, derived from the array size */
+ unsigned int n_volk_machines = sizeof(volk_machines)/sizeof(*volk_machines);
diff --git a/volk/tmpl/volk_machines.tmpl.h b/volk/tmpl/volk_machines.tmpl.h
new file mode 100644
index 000000000..7e11b1079
--- /dev/null
+++ b/volk/tmpl/volk_machines.tmpl.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_LIBVOLK_MACHINES_H
+#define INCLUDED_LIBVOLK_MACHINES_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_typedefs.h>
+
+#include <stdbool.h>
+#include <stdlib.h>
+
+__VOLK_DECL_BEGIN
+
+ /*
+  * Description of one machine (a set of archs compiled together) plus,
+  * for every kernel, the table of implementations available on it.
+  * NOTE(review): the per-kernel arrays are dimensioned by the total number
+  * of archs; presumably no kernel has more implementations than that - confirm.
+  */
+ struct volk_machine {
+ const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format)
+ const char *name;
+ const size_t alignment; //the maximum byte alignment required for functions in this library
+ #for $kern in $kernels
+ const char *$(kern.name)_name;
+ const char *$(kern.name)_impl_names[$(len($archs))];
+ const int $(kern.name)_impl_deps[$(len($archs))];
+ const bool $(kern.name)_impl_alignment[$(len($archs))];
+ const $(kern.pname) $(kern.name)_impls[$(len($archs))];
+ const size_t $(kern.name)_n_impls;
+ #end for
+ };
+
+#for $machine in $machines
+#ifdef LV_MACHINE_$(machine.name.upper())
+extern struct volk_machine volk_machine_$(machine.name);
+#endif
+#end for
+
+__VOLK_DECL_END
+
+#endif //INCLUDED_LIBVOLK_MACHINES_H
diff --git a/volk/tmpl/volk_typedefs.tmpl.h b/volk/tmpl/volk_typedefs.tmpl.h
new file mode 100644
index 000000000..6f5426965
--- /dev/null
+++ b/volk/tmpl/volk_typedefs.tmpl.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_TYPEDEFS
+#define INCLUDED_VOLK_TYPEDEFS
+
+#include <inttypes.h>
+#include <volk/volk_complex.h>
+
+ /* one function pointer type per kernel: takes the kernel's argument types, returns void */
+ #for $kern in $kernels
+ typedef void (*$(kern.pname))($kern.arglist_types);
+ #end for
+
+#endif /*INCLUDED_VOLK_TYPEDEFS*/
diff --git a/volk/volk.pc.in b/volk/volk.pc.in
new file mode 100644
index 000000000..58e976786
--- /dev/null
+++ b/volk/volk.pc.in
@@ -0,0 +1,14 @@
+ # pkg-config metadata for VOLK; the @...@ placeholders are substituted at configure time.
+ prefix=@prefix@
+ exec_prefix=@exec_prefix@
+ libdir=@libdir@
+ includedir=@includedir@
+ LV_CXXFLAGS=@LV_CXXFLAGS@
+
+
+ Name: volk
+ Description: VOLK: Vector Optimized Library of Kernels
+ Requires:
+ Version: @VERSION@
+ # pass the library directory so linking also works for non-default prefixes
+ Libs: -L${libdir} -lvolk
+ Cflags: -I${includedir} ${LV_CXXFLAGS}
+